From 0cc9e731104220c25d5e0869adf63df8894db6ce Mon Sep 17 00:00:00 2001 From: huseinzol05 <husein.zol05@gmail.com> Date: Wed, 1 Jun 2022 21:38:05 +0800 Subject: [PATCH] initial release 4.8 --- docs/index.rst | 2 + docs/load-jawi-rumi.ipynb | 303 ++++ docs/load-rumi-jawi.ipynb | 284 ++++ example/jawi-rumi/load-jawi-rumi.ipynb | 303 ++++ example/rumi-jawi/load-rumi-jawi.ipynb | 284 ++++ malaya/__init__.py | 6 +- malaya/jawi_rumi.py | 21 +- malaya/model/tf.py | 59 + malaya/rumi_jawi.py | 23 +- malaya/supervised/settings.py | 140 ++ malaya/supervised/transformer.py | 23 + malaya/text/function.py | 2 +- .../jawi-rumi/lstm-bahdanau-jawi-rumi.ipynb | 1346 ++++++++++++++++ session/rumi-jawi/lstm-bahdanau.ipynb | 1398 +++++++++++++++++ session/rumi-jawi/prepare-t2t-dev.ipynb | 537 +++++++ session/rumi-jawi/prepare-t2t-train.ipynb | 693 ++++++++ session/rumi-jawi/t2t_small.py | 240 +++ setup.py | 2 +- 18 files changed, 5649 insertions(+), 17 deletions(-) create mode 100644 docs/load-jawi-rumi.ipynb create mode 100644 docs/load-rumi-jawi.ipynb create mode 100644 example/jawi-rumi/load-jawi-rumi.ipynb create mode 100644 example/rumi-jawi/load-rumi-jawi.ipynb create mode 100644 session/jawi-rumi/lstm-bahdanau-jawi-rumi.ipynb create mode 100644 session/rumi-jawi/lstm-bahdanau.ipynb create mode 100644 session/rumi-jawi/prepare-t2t-dev.ipynb create mode 100644 session/rumi-jawi/prepare-t2t-train.ipynb create mode 100644 session/rumi-jawi/t2t_small.py diff --git a/docs/index.rst b/docs/index.rst index b350463e..3ee65436 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -63,6 +63,8 @@ Contents: :caption: Convert Module load-phoneme + load-rumi-jawi + load-jawi-rumi .. 
toctree:: :maxdepth: 2 diff --git a/docs/load-jawi-rumi.ipynb b/docs/load-jawi-rumi.ipynb new file mode 100644 index 00000000..47319a64 --- /dev/null +++ b/docs/load-jawi-rumi.ipynb @@ -0,0 +1,303 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Jawi-to-Rumi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<div class=\"alert alert-info\">\n", + "\n", + "This tutorial is available as an IPython notebook at [Malaya/example/jawi-rumi](https://github.com/huseinzol05/Malaya/tree/master/example/jawi-rumi).\n", + " \n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<div class=\"alert alert-info\">\n", + "\n", + "This module is trained on both standard and local (including social media) language structures, so it is safe to use for both.\n", + " \n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explanation\n", + "\n", + "Originally from https://www.ejawi.net/converterV2.php?go=rumi able to convert Rumi to Jawi using heuristic method. 
So Malaya converts text using that heuristic and learns the mapping with a deep learning model by inverting the dataset.\n", + "\n", + "`چوميل` -> `comel`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 5.95 s, sys: 1.15 s, total: 7.1 s\n", + "Wall time: 9.05 s\n" + ] + } + ], + "source": [ + "%%time\n", + "import malaya" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use deep learning model\n", + "\n", + "Load LSTM + Bahdanau Attention Jawi to Rumi model.\n", + "\n", + "If you are using Tensorflow 2, make sure Tensorflow Addons is already installed,\n", + "\n", + "```bash\n", + "pip install tensorflow-addons -U\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "def deep_model(quantized: bool = False, **kwargs):\n", + " \"\"\"\n", + " Load LSTM + Bahdanau Attention Rumi to Jawi model.\n", + " Original size 11MB, quantized size 2.92MB .\n", + " CER on test set: 0.09239719040982326\n", + " WER on test set: 0.33811816744187656\n", + "\n", + " Parameters\n", + " ----------\n", + " quantized : bool, optional (default=False)\n", + " if True, will load 8-bit quantized model.\n", + " Quantized model not necessary faster, totally depends on the machine.\n", + "\n", + " Returns\n", + " -------\n", + " result: malaya.model.tf.Seq2SeqLSTM class\n", + " \"\"\"\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "530a47ea5c514ae9aa68c8a4e1e29d9c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=11034253.0, style=ProgressStyle(descrip…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + 
"source": [ + "model = malaya.jawi_rumi.deep_model()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Quantized model\n", + "\n", + "To load 8-bit quantized model, simply pass `quantized = True`, default is `False`.\n", + "\n", + "We can expect slightly accuracy drop from quantized model, and not necessary faster than normal 32-bit float model, totally depends on machine." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Load quantized model will cause accuracy drop.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6d1d22a65abd48a28f9a1eb62f2d0c4d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2926859.0, style=ProgressStyle(descript…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "quantized_model = malaya.jawi_rumi.deep_model(quantized = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Predict\n", + "\n", + "```python\n", + "def predict(self, strings: List[str], beam_search: bool = False):\n", + " \"\"\"\n", + " Convert to target string.\n", + "\n", + " Parameters\n", + " ----------\n", + " strings : List[str]\n", + " beam_search : bool, (optional=False)\n", + " If True, use beam search decoder, else use greedy decoder.\n", + "\n", + " Returns\n", + " -------\n", + " result: List[str]\n", + " \"\"\"\n", + "```\n", + "\n", + "If want to speed up the inference, set `beam_search = False`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['saya suka makan im',\n", + " 'eak ack kotok',\n", + " 'aisuk berthday saya, jegan lupa bawak hadiah']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(['ساي سوك ماكن ايم', 'اياق اچق كوتوق', 'ايسوق بيرثداي ساي، جڬن لوڤا باوق هديه'])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['saya suka makan im',\n", + " 'eak ack kotok',\n", + " 'aisuk berthday saya, jegan lopa bawak hadiah']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "quantized_model.predict(['ساي سوك ماكن ايم', 'اياق اچق كوتوق', 'ايسوق بيرثداي ساي، جڬن لوڤا باوق هديه'])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/load-rumi-jawi.ipynb b/docs/load-rumi-jawi.ipynb new file mode 100644 index 00000000..e316c977 --- /dev/null 
+++ b/docs/load-rumi-jawi.ipynb @@ -0,0 +1,284 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Rumi-to-Jawi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<div class=\"alert alert-info\">\n", + "\n", + "This tutorial is available as an IPython notebook at [Malaya/example/rumi-jawi](https://github.com/huseinzol05/Malaya/tree/master/example/rumi-jawi).\n", + " \n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<div class=\"alert alert-info\">\n", + "\n", + "This module is trained on both standard and local (including social media) language structures, so it is safe to use for both.\n", + " \n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explanation\n", + "\n", + "Originally from https://www.ejawi.net/converterV2.php?go=rumi able to convert Rumi to Jawi using heuristic method. So Malaya converts text using that heuristic and learns the mapping with a deep learning model.\n", + "\n", + "`comel` -> `چوميل`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 6.81 s, sys: 1.42 s, total: 8.23 s\n", + "Wall time: 10.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "import malaya" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use deep learning model\n", + "\n", + "Load LSTM + Bahdanau Attention Rumi to Jawi model.\n", + "\n", + "If you are using Tensorflow 2, make sure Tensorflow Addons is already installed,\n", + "\n", + "```bash\n", + "pip install tensorflow-addons -U\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "def deep_model(quantized: bool = False, **kwargs):\n", + " \"\"\"\n", + " Load LSTM + Bahdanau Attention Rumi to Jawi model.\n", + " Original size 11MB, quantized size 2.92MB .\n", + " CER on test set: 0.014847105998349451\n", + " 
WER on test set: 0.06737832963079593\n", + "\n", + " Parameters\n", + " ----------\n", + " quantized : bool, optional (default=False)\n", + " if True, will load 8-bit quantized model.\n", + " Quantized model not necessary faster, totally depends on the machine.\n", + "\n", + " Returns\n", + " -------\n", + " result: malaya.model.tf.Seq2SeqLSTM class\n", + " \"\"\"\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "model = malaya.rumi_jawi.deep_model()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Quantized model\n", + "\n", + "To load 8-bit quantized model, simply pass `quantized = True`, default is `False`.\n", + "\n", + "We can expect slightly accuracy drop from quantized model, and not necessary faster than normal 32-bit float model, totally depends on machine." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Load quantized model will cause accuracy drop.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bfbe7041190a428885bf0f5943f70bbc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2924259.0, style=ProgressStyle(descript…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "quantized_model = malaya.rumi_jawi.deep_model(quantized = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Predict\n", + "\n", + "```python\n", + "def predict(self, strings: List[str], beam_search: bool = False):\n", + " \"\"\"\n", + " Convert to target string.\n", + "\n", + " Parameters\n", + " ----------\n", + " strings : List[str]\n", + " beam_search : bool, (optional=False)\n", + " If 
True, use beam search decoder, else use greedy decoder.\n", + "\n", + " Returns\n", + " -------\n", + " result: List[str]\n", + " \"\"\"\n", + "```\n", + "\n", + "If want to speed up the inference, set `beam_search = False`." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ساي سوك ماكن ايم', 'اياق اچق كوتوق', 'ايسوق بيرثداي ساي، جڬن لوڤا باوق هديه']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(['saya suka makan ayam', 'ayaq acaq kotoq', 'esok birthday saya, jgn lupa bawak hadiah'])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ساي سوك ماكن ايم', 'اياق اچق كوتوق', 'ايسوق بيرثداي ساي، جڬن لوڤا باوق هديه']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "quantized_model.predict(['saya suka makan ayam', 'ayaq acaq kotoq', 'esok birthday saya, jgn lupa bawak hadiah'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + 
"types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/example/jawi-rumi/load-jawi-rumi.ipynb b/example/jawi-rumi/load-jawi-rumi.ipynb new file mode 100644 index 00000000..47319a64 --- /dev/null +++ b/example/jawi-rumi/load-jawi-rumi.ipynb @@ -0,0 +1,303 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Jawi-to-Rumi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<div class=\"alert alert-info\">\n", + "\n", + "This tutorial is available as an IPython notebook at [Malaya/example/jawi-rumi](https://github.com/huseinzol05/Malaya/tree/master/example/jawi-rumi).\n", + " \n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<div class=\"alert alert-info\">\n", + "\n", + "This module is trained on both standard and local (including social media) language structures, so it is safe to use for both.\n", + " \n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explanation\n", + "\n", + "Originally from https://www.ejawi.net/converterV2.php?go=rumi able to convert Rumi to Jawi using heuristic method. 
So Malaya converts text using that heuristic and learns the mapping with a deep learning model by inverting the dataset.\n", + "\n", + "`چوميل` -> `comel`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 5.95 s, sys: 1.15 s, total: 7.1 s\n", + "Wall time: 9.05 s\n" + ] + } + ], + "source": [ + "%%time\n", + "import malaya" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use deep learning model\n", + "\n", + "Load LSTM + Bahdanau Attention Jawi to Rumi model.\n", + "\n", + "If you are using Tensorflow 2, make sure Tensorflow Addons is already installed,\n", + "\n", + "```bash\n", + "pip install tensorflow-addons -U\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "def deep_model(quantized: bool = False, **kwargs):\n", + " \"\"\"\n", + " Load LSTM + Bahdanau Attention Rumi to Jawi model.\n", + " Original size 11MB, quantized size 2.92MB .\n", + " CER on test set: 0.09239719040982326\n", + " WER on test set: 0.33811816744187656\n", + "\n", + " Parameters\n", + " ----------\n", + " quantized : bool, optional (default=False)\n", + " if True, will load 8-bit quantized model.\n", + " Quantized model not necessary faster, totally depends on the machine.\n", + "\n", + " Returns\n", + " -------\n", + " result: malaya.model.tf.Seq2SeqLSTM class\n", + " \"\"\"\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "530a47ea5c514ae9aa68c8a4e1e29d9c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=11034253.0, style=ProgressStyle(descrip…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + 
"source": [ + "model = malaya.jawi_rumi.deep_model()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Quantized model\n", + "\n", + "To load 8-bit quantized model, simply pass `quantized = True`, default is `False`.\n", + "\n", + "We can expect slightly accuracy drop from quantized model, and not necessary faster than normal 32-bit float model, totally depends on machine." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Load quantized model will cause accuracy drop.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6d1d22a65abd48a28f9a1eb62f2d0c4d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2926859.0, style=ProgressStyle(descript…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "quantized_model = malaya.jawi_rumi.deep_model(quantized = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Predict\n", + "\n", + "```python\n", + "def predict(self, strings: List[str], beam_search: bool = False):\n", + " \"\"\"\n", + " Convert to target string.\n", + "\n", + " Parameters\n", + " ----------\n", + " strings : List[str]\n", + " beam_search : bool, (optional=False)\n", + " If True, use beam search decoder, else use greedy decoder.\n", + "\n", + " Returns\n", + " -------\n", + " result: List[str]\n", + " \"\"\"\n", + "```\n", + "\n", + "If want to speed up the inference, set `beam_search = False`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['saya suka makan im',\n", + " 'eak ack kotok',\n", + " 'aisuk berthday saya, jegan lupa bawak hadiah']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(['ساي سوك ماكن ايم', 'اياق اچق كوتوق', 'ايسوق بيرثداي ساي، جڬن لوڤا باوق هديه'])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['saya suka makan im',\n", + " 'eak ack kotok',\n", + " 'aisuk berthday saya, jegan lopa bawak hadiah']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "quantized_model.predict(['ساي سوك ماكن ايم', 'اياق اچق كوتوق', 'ايسوق بيرثداي ساي، جڬن لوڤا باوق هديه'])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/example/rumi-jawi/load-rumi-jawi.ipynb b/example/rumi-jawi/load-rumi-jawi.ipynb new file mode 100644 index 
00000000..e316c977 --- /dev/null +++ b/example/rumi-jawi/load-rumi-jawi.ipynb @@ -0,0 +1,284 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Rumi-to-Jawi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<div class=\"alert alert-info\">\n", + "\n", + "This tutorial is available as an IPython notebook at [Malaya/example/rumi-jawi](https://github.com/huseinzol05/Malaya/tree/master/example/rumi-jawi).\n", + " \n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<div class=\"alert alert-info\">\n", + "\n", + "This module is trained on both standard and local (including social media) language structures, so it is safe to use for both.\n", + " \n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explanation\n", + "\n", + "Originally from https://www.ejawi.net/converterV2.php?go=rumi able to convert Rumi to Jawi using heuristic method. So Malaya converts text using that heuristic and learns the mapping with a deep learning model.\n", + "\n", + "`comel` -> `چوميل`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 6.81 s, sys: 1.42 s, total: 8.23 s\n", + "Wall time: 10.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "import malaya" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use deep learning model\n", + "\n", + "Load LSTM + Bahdanau Attention Rumi to Jawi model.\n", + "\n", + "If you are using Tensorflow 2, make sure Tensorflow Addons is already installed,\n", + "\n", + "```bash\n", + "pip install tensorflow-addons -U\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "def deep_model(quantized: bool = False, **kwargs):\n", + " \"\"\"\n", + " Load LSTM + Bahdanau Attention Rumi to Jawi model.\n", + " Original size 11MB, quantized size 2.92MB .\n", + " 
CER on test set: 0.014847105998349451\n", + " WER on test set: 0.06737832963079593\n", + "\n", + " Parameters\n", + " ----------\n", + " quantized : bool, optional (default=False)\n", + " if True, will load 8-bit quantized model.\n", + " Quantized model not necessary faster, totally depends on the machine.\n", + "\n", + " Returns\n", + " -------\n", + " result: malaya.model.tf.Seq2SeqLSTM class\n", + " \"\"\"\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "model = malaya.rumi_jawi.deep_model()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Quantized model\n", + "\n", + "To load 8-bit quantized model, simply pass `quantized = True`, default is `False`.\n", + "\n", + "We can expect slightly accuracy drop from quantized model, and not necessary faster than normal 32-bit float model, totally depends on machine." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Load quantized model will cause accuracy drop.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bfbe7041190a428885bf0f5943f70bbc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2924259.0, style=ProgressStyle(descript…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "quantized_model = malaya.rumi_jawi.deep_model(quantized = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Predict\n", + "\n", + "```python\n", + "def predict(self, strings: List[str], beam_search: bool = False):\n", + " \"\"\"\n", + " Convert to target string.\n", + "\n", + " Parameters\n", + " ----------\n", + " strings : List[str]\n", + " 
beam_search : bool, (optional=False)\n", + " If True, use beam search decoder, else use greedy decoder.\n", + "\n", + " Returns\n", + " -------\n", + " result: List[str]\n", + " \"\"\"\n", + "```\n", + "\n", + "If want to speed up the inference, set `beam_search = False`." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ساي سوك ماكن ايم', 'اياق اچق كوتوق', 'ايسوق بيرثداي ساي، جڬن لوڤا باوق هديه']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(['saya suka makan ayam', 'ayaq acaq kotoq', 'esok birthday saya, jgn lupa bawak hadiah'])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ساي سوك ماكن ايم', 'اياق اچق كوتوق', 'ايسوق بيرثداي ساي، جڬن لوڤا باوق هديه']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "quantized_model.predict(['saya suka makan ayam', 'ayaq acaq kotoq', 'esok birthday saya, jgn lupa bawak hadiah'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + 
"varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/malaya/__init__.py b/malaya/__init__.py index 40f15565..c0727399 100644 --- a/malaya/__init__.py +++ b/malaya/__init__.py @@ -8,8 +8,8 @@ from malaya_boilerplate.utils import get_home -version = '4.7' -bump_version = '4.7.5' +version = '4.8' +bump_version = '4.8.0' __version__ = bump_version package = 'malaya' @@ -25,6 +25,7 @@ from . import emotion from . import entity from . import generator +from . import jawi_rumi from . import keyword_extraction from . import knowledge_graph from . import language_detection @@ -38,6 +39,7 @@ from . import preprocessing from . import qa from . import relevancy +from . import rumi_jawi from . import segmentation from . import sentiment from . import similarity diff --git a/malaya/jawi_rumi.py b/malaya/jawi_rumi.py index 81acc2ca..12d17730 100644 --- a/malaya/jawi_rumi.py +++ b/malaya/jawi_rumi.py @@ -1,12 +1,17 @@ +from malaya.supervised import t2t +from malaya.supervised.settings import jawi_left, jawi_right +from malaya.text.function import rumi_jawi_textcleaning from herpetologist import check_type from typing import List @check_type -def deep_model(quantized=False, **kwargs): +def deep_model(quantized: bool = False, **kwargs): """ Load LSTM + Bahdanau Attention Rumi to Jawi model. - Original size 41.6MB, quantized size 10.6MB . + Original size 11MB, quantized size 2.92MB . 
+ CER on test set: 0.09239719040982326 + WER on test set: 0.33811816744187656 Parameters ---------- @@ -18,7 +23,11 @@ def deep_model(quantized=False, **kwargs): ------- result: malaya.model.tf.Seq2SeqLSTM class """ - - -def transformer(model='small', quantized=False, **kwargs): - pass + return t2t.load_lstm( + module='jawi-rumi', + left_dict=jawi_right, + right_dict=jawi_left, + cleaning=rumi_jawi_textcleaning, + quantized=quantized, + **kwargs, + ) diff --git a/malaya/model/tf.py b/malaya/model/tf.py index f4c0d205..cda4daf0 100644 --- a/malaya/model/tf.py +++ b/malaya/model/tf.py @@ -1142,3 +1142,62 @@ def predict(self, strings: List[str], beam_search: bool = False): results = [''.join([self._rev_right_dict[i] for i in r if i > 3]) for r in v] return results + + +class TransformerChar: + def __init__(self, input_nodes, output_nodes, sess, left_dict, cleaning, **kwargs): + self._input_nodes = input_nodes + self._output_nodes = output_nodes + self._sess = sess + self._left_dict = left_dict + self._cleaning = cleaning + self._rev_left_dict = {v: k for k, v in self._left_dict.items()} + + def _predict(self, strings, beam_search=True): + encoded = [[self._left_dict[c] for c in self._cleaning(string, self._left_dict)] + [1] for string in strings] + batch_x = pad_sentence_batch(encoded, 0)[0] + + if beam_search: + output = 'beam' + else: + output = 'greedy' + + r = self._execute( + inputs=[batch_x], + input_labels=['Placeholder'], + output_labels=[output], + ) + v = r[output] + results = [''.join([self._rev_left_dict[i] for i in r if i > 3]) for r in v] + + return results + + @check_type + def greedy_decoder(self, strings: List[str]): + """ + Convert strings using greedy decoder. + + Parameters + ---------- + strings : List[str] + + Returns + ------- + result: List[str] + """ + return self._predict(strings=strings, beam_search=False) + + @check_type + def beam_decoder(self, strings: List[str]): + """ + Convert strings using beam decoder, beam width size 3, alpha 0.5 . 
+ + Parameters + ---------- + strings : List[str] + + Returns + ------- + result: List[str] + """ + return self._predict(strings=strings, beam_search=True) diff --git a/malaya/rumi_jawi.py b/malaya/rumi_jawi.py index fed4ecb0..6153587e 100644 --- a/malaya/rumi_jawi.py +++ b/malaya/rumi_jawi.py @@ -1,12 +1,17 @@ +from malaya.supervised import t2t +from malaya.supervised.settings import jawi_left, jawi_right +from malaya.text.function import rumi_jawi_textcleaning from herpetologist import check_type from typing import List @check_type -def deep_model(quantized=False, **kwargs): +def deep_model(quantized: bool = False, **kwargs): """ Load LSTM + Bahdanau Attention Rumi to Jawi model. - Original size 41.6MB, quantized size 10.6MB . + Original size 11MB, quantized size 2.92MB . + CER on test set: 0.014847105998349451 + WER on test set: 0.06737832963079593 Parameters ---------- @@ -16,9 +21,13 @@ def deep_model(quantized=False, **kwargs): Returns ------- - result: malaya.jawi.DeepJawi class + result: malaya.model.tf.Seq2SeqLSTM class """ - - -def transformer(model='small', quantized=False, **kwargs): - pass + return t2t.load_lstm( + module='rumi-jawi', + left_dict=jawi_left, + right_dict=jawi_right, + cleaning=rumi_jawi_textcleaning, + quantized=quantized, + **kwargs, + ) diff --git a/malaya/supervised/settings.py b/malaya/supervised/settings.py index 6d55e3cc..52624d85 100644 --- a/malaya/supervised/settings.py +++ b/malaya/supervised/settings.py @@ -181,3 +181,143 @@ 0: 0, 1: 1, 2: 2, 3: 3, ' ': 4, '!': 5, '"': 6, '-': 7, '.': 8, ':': 9, ';': 10, '،': 11, '؟': 12, 'ء': 13, 'آ': 14, 'أ': 15, 'ؤ': 16, 'إ': 17, 'ئ': 18, 'ا': 19, 'ب': 20, 'ة': 21, 'ت': 22, 'ث': 23, 'ج': 24, 'ح': 25, 'خ': 26, 'د': 27, 'ذ': 28, 'ر': 29, 'ز': 30, 'س': 31, 'ش': 32, 'ص': 33, 'ض': 34, 'ط': 35, 'ظ': 36, 'ع': 37, 'غ': 38, 'ف': 39, 'ق': 40, 'ك': 41, 'ل': 42, 'م': 43, 'ن': 44, 'ه': 45, 'و': 46, 'ى': 47, 'ي': 48, 'ّ': 49, 'ٓ': 50, '٠': 51, '١': 52, '٢': 53, '٣': 54, '٤': 55, '٥': 56, '٦': 57, '٧': 
58, '٨': 59, '٩': 60, 'چ': 61, 'ڠ': 62, 'ڤ': 63, 'ڬ': 64, 'ڽ': 65, 'ۏ': 66, '﴾': 67, '﴿': 68 } + +jawi_t2t = { + 0: 0, + 1: 1, + 2: 2, + 3: 3, + ' ': 4, + '!': 5, + '"': 6, + "'": 7, + '(': 8, + ')': 9, + '+': 10, + ',': 11, + '-': 12, + '.': 13, + '0': 14, + '1': 15, + '2': 16, + '3': 17, + '4': 18, + '5': 19, + '6': 20, + '7': 21, + '8': 22, + '9': 23, + ':': 24, + ';': 25, + '?': 26, + 'A': 27, + 'B': 28, + 'C': 29, + 'D': 30, + 'E': 31, + 'F': 32, + 'G': 33, + 'H': 34, + 'I': 35, + 'J': 36, + 'K': 37, + 'L': 38, + 'M': 39, + 'N': 40, + 'O': 41, + 'P': 42, + 'Q': 43, + 'R': 44, + 'S': 45, + 'T': 46, + 'U': 47, + 'V': 48, + 'W': 49, + 'X': 50, + 'Y': 51, + 'Z': 52, + 'a': 53, + 'b': 54, + 'c': 55, + 'd': 56, + 'e': 57, + 'f': 58, + 'g': 59, + 'h': 60, + 'i': 61, + 'j': 62, + 'k': 63, + 'l': 64, + 'm': 65, + 'n': 66, + 'o': 67, + 'p': 68, + 'q': 69, + 'r': 70, + 's': 71, + 't': 72, + 'u': 73, + 'v': 74, + 'w': 75, + 'x': 76, + 'y': 77, + 'z': 78, + '،': 79, + '؟': 80, + 'ء': 81, + 'آ': 82, + 'أ': 83, + 'ؤ': 84, + 'إ': 85, + 'ئ': 86, + 'ا': 87, + 'ب': 88, + 'ة': 89, + 'ت': 90, + 'ث': 91, + 'ج': 92, + 'ح': 93, + 'خ': 94, + 'د': 95, + 'ذ': 96, + 'ر': 97, + 'ز': 98, + 'س': 99, + 'ش': 100, + 'ص': 101, + 'ض': 102, + 'ط': 103, + 'ظ': 104, + 'ع': 105, + 'غ': 106, + 'ف': 107, + 'ق': 108, + 'ك': 109, + 'ل': 110, + 'م': 111, + 'ن': 112, + 'ه': 113, + 'و': 114, + 'ى': 115, + 'ي': 116, + 'ّ': 117, + 'ٓ': 118, + '٠': 119, + '١': 120, + '٢': 121, + '٣': 122, + '٤': 123, + '٥': 124, + '٦': 125, + '٧': 126, + '٨': 127, + '٩': 128, + 'چ': 129, + 'ڠ': 130, + 'ڤ': 131, + 'ڬ': 132, + 'ڽ': 133, + 'ۏ': 134, + '﴾': 135, + '﴿': 136 +} diff --git a/malaya/supervised/transformer.py b/malaya/supervised/transformer.py index a3d76efd..ad1fd93d 100644 --- a/malaya/supervised/transformer.py +++ b/malaya/supervised/transformer.py @@ -11,6 +11,7 @@ ) from malaya.preprocessing import Tokenizer from malaya.text.t2t import text_encoder +from malaya.model.tf import TransformerChar from malaya.path 
import T2T_BPE_MODEL, LM_VOCAB ENCODER_MODEL = { @@ -92,3 +93,25 @@ def load_tatabahasa(module, model, model_class, quantized=False, **kwargs): tokenizer=tokenizer, word_tokenizer=word_tokenizer, ) + + +def load_char(module, model, left_dict, cleaning, quantized=False, **kwargs): + path = check_file( + file=model, + module=module, + keys={'model': 'model.pb'}, + quantized=quantized, + **kwargs, + ) + g = load_graph(path['model'], **kwargs) + inputs = ['x_placeholder'] + outputs = ['greedy', 'tag_greedy'] + input_nodes, output_nodes = nodes_session(g, inputs, outputs) + + return TransformerChar( + input_nodes=input_nodes, + output_nodes=output_nodes, + sess=generate_session(graph=g, **kwargs), + left_dict=left_dict, + cleaning=cleaning, + ) diff --git a/malaya/text/function.py b/malaya/text/function.py index f6beaa06..d467550e 100644 --- a/malaya/text/function.py +++ b/malaya/text/function.py @@ -428,7 +428,7 @@ def phoneme_textcleaning(string, dict, replace_chars='.,!?['): def rumi_jawi_textcleaning(string, dict): - l = ''.join([c for c in l if c in dict]) + l = ''.join([c for c in string if c in dict]) return re.sub(r'[ ]+', ' ', l).strip() diff --git a/session/jawi-rumi/lstm-bahdanau-jawi-rumi.ipynb b/session/jawi-rumi/lstm-bahdanau-jawi-rumi.ipynb new file mode 100644 index 00000000..8ea22595 --- /dev/null +++ b/session/jawi-rumi/lstm-bahdanau-jawi-rumi.ipynb @@ -0,0 +1,1346 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a3844223", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ['CUDA_VISIBLE_DEVICES'] = ''" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "99fff94b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:Deprecation warnings have been disabled. 
Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-06-01 18:47:41.589204: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n" + ] + } + ], + "source": [ + "import tensorflow as tf\n", + "import numpy as np\n", + "import json\n", + "from glob import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "53dc44e6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0, 1, 2, 3]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "PAD = 0\n", + "EOS = 1\n", + "UNK = 2\n", + "GO = 3\n", + "[PAD, EOS, UNK, GO]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "98c4fe31", + "metadata": {}, + "outputs": [], + "source": [ + "left_dict = {0: 0,\n", + " 1: 1,\n", + " 2: 2,\n", + " 3: 3,\n", + " ' ': 4,\n", + " '!': 5,\n", + " '\"': 6,\n", + " \"'\": 7,\n", + " '(': 8,\n", + " ')': 9,\n", + " '+': 10,\n", + " ',': 11,\n", + " '-': 12,\n", + " '.': 13,\n", + " '0': 14,\n", + " '1': 15,\n", + " '2': 16,\n", + " '3': 17,\n", + " '4': 18,\n", + " '5': 19,\n", + " '6': 20,\n", + " '7': 21,\n", + " '8': 22,\n", + " '9': 23,\n", + " ':': 24,\n", + " ';': 25,\n", + " '?': 26,\n", + " 'A': 27,\n", + " 'B': 28,\n", + " 'C': 29,\n", + " 'D': 30,\n", + " 'E': 31,\n", + " 'F': 32,\n", + " 'G': 33,\n", + " 'H': 34,\n", + " 'I': 35,\n", + " 'J': 36,\n", + " 'K': 37,\n", + " 'L': 38,\n", + " 'M': 39,\n", + " 'N': 40,\n", + " 'O': 41,\n", + " 'P': 42,\n", + " 'Q': 43,\n", + " 'R': 44,\n", + " 'S': 45,\n", + " 'T': 46,\n", + " 'U': 47,\n", + " 'V': 48,\n", + " 'W': 49,\n", + " 'X': 50,\n", + " 'Y': 51,\n", + " 'Z': 52,\n", + " 'a': 53,\n", + " 'b': 54,\n", + " 'c': 55,\n", + " 'd': 56,\n", + " 'e': 57,\n", + " 'f': 58,\n", + " 'g': 59,\n", + " 'h': 60,\n", + " 'i': 61,\n", + " 'j': 62,\n", + " 'k': 63,\n", + " 'l': 64,\n", 
+ " 'm': 65,\n", + " 'n': 66,\n", + " 'o': 67,\n", + " 'p': 68,\n", + " 'q': 69,\n", + " 'r': 70,\n", + " 's': 71,\n", + " 't': 72,\n", + " 'u': 73,\n", + " 'v': 74,\n", + " 'w': 75,\n", + " 'x': 76,\n", + " 'y': 77,\n", + " 'z': 78}\n", + "rev_left_dict = {v: k for k, v in left_dict.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8cb15445", + "metadata": {}, + "outputs": [], + "source": [ + "right_dict = {0: 0,\n", + " 1: 1,\n", + " 2: 2,\n", + " 3: 3,\n", + " ' ': 4,\n", + " '!': 5,\n", + " '\"': 6,\n", + " '-': 7,\n", + " '.': 8,\n", + " ':': 9,\n", + " ';': 10,\n", + " '،': 11,\n", + " '؟': 12,\n", + " 'ء': 13,\n", + " 'آ': 14,\n", + " 'أ': 15,\n", + " 'ؤ': 16,\n", + " 'إ': 17,\n", + " 'ئ': 18,\n", + " 'ا': 19,\n", + " 'ب': 20,\n", + " 'ة': 21,\n", + " 'ت': 22,\n", + " 'ث': 23,\n", + " 'ج': 24,\n", + " 'ح': 25,\n", + " 'خ': 26,\n", + " 'د': 27,\n", + " 'ذ': 28,\n", + " 'ر': 29,\n", + " 'ز': 30,\n", + " 'س': 31,\n", + " 'ش': 32,\n", + " 'ص': 33,\n", + " 'ض': 34,\n", + " 'ط': 35,\n", + " 'ظ': 36,\n", + " 'ع': 37,\n", + " 'غ': 38,\n", + " 'ف': 39,\n", + " 'ق': 40,\n", + " 'ك': 41,\n", + " 'ل': 42,\n", + " 'م': 43,\n", + " 'ن': 44,\n", + " 'ه': 45,\n", + " 'و': 46,\n", + " 'ى': 47,\n", + " 'ي': 48,\n", + " 'ّ': 49,\n", + " 'ٓ': 50,\n", + " '٠': 51,\n", + " '١': 52,\n", + " '٢': 53,\n", + " '٣': 54,\n", + " '٤': 55,\n", + " '٥': 56,\n", + " '٦': 57,\n", + " '٧': 58,\n", + " '٨': 59,\n", + " '٩': 60,\n", + " 'چ': 61,\n", + " 'ڠ': 62,\n", + " 'ڤ': 63,\n", + " 'ڬ': 64,\n", + " 'ڽ': 65,\n", + " 'ۏ': 66,\n", + " '﴾': 67,\n", + " '﴿': 68}\n", + "rev_right_dict = {v: k for k, v in right_dict.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "407e1aab", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['كاوسن كڤولاوان سڤراتلي يڠ',\n", + " 'ڤرليمين ڤرسكوتوان اونتوق',\n", + " 'ڤنوبوهن تامن سينر',\n", + " 'ڤريڠكت كمنترين، كاتڽ.',\n", + " 'تله مندرم سباڽق تيڬ',\n", + " 'هاري 
اين،',\n", + " 'برتوليرنسي\"',\n", + " 'مڠيسهكان',\n", + " 'سوڠ-قواڠ',\n", + " 'سيبير دڠن باجو بياسا.']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with open('jawi-set.json') as fopen:\n", + " jawi = json.load(fopen)\n", + "jawi['train'][:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1cb0953d", + "metadata": {}, + "outputs": [], + "source": [ + "with open('rumi-set.json') as fopen:\n", + " rumi = json.load(fopen)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0783c43c", + "metadata": {}, + "outputs": [], + "source": [ + "train_X = jawi['train']\n", + "train_Y = rumi['train']\n", + "\n", + "test_X = jawi['test']\n", + "test_Y = rumi['test']" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0de00f6f", + "metadata": {}, + "outputs": [], + "source": [ + "class Translator:\n", + " def __init__(self, size_layer, num_layers, embedded_size,\n", + " from_dict_size, to_dict_size, learning_rate, beam_width = 10):\n", + " \n", + " def cells(reuse=False):\n", + " return tf.nn.rnn_cell.LSTMCell(size_layer,initializer=tf.orthogonal_initializer(),reuse=reuse)\n", + " \n", + " def attention(encoder_out, seq_len, reuse=False):\n", + " attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units = size_layer, \n", + " memory = encoder_out,\n", + " memory_sequence_length = seq_len)\n", + " return tf.contrib.seq2seq.AttentionWrapper(\n", + " cell = tf.nn.rnn_cell.MultiRNNCell([cells(reuse) for _ in range(num_layers)]), \n", + " attention_mechanism = attention_mechanism,\n", + " attention_layer_size = size_layer)\n", + " \n", + " self.X = tf.placeholder(tf.int32, [None, None])\n", + " self.Y = tf.placeholder(tf.int32, [None, None])\n", + " self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)\n", + " self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)\n", + " batch_size = tf.shape(self.X)[0]\n", + " \n", + " encoder_embedding 
= tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))\n", + " decoder_embedding = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))\n", + " \n", + " encoder_out, encoder_state = tf.nn.dynamic_rnn(\n", + " cell = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)]), \n", + " inputs = tf.nn.embedding_lookup(encoder_embedding, self.X),\n", + " sequence_length = self.X_seq_len,\n", + " dtype = tf.float32)\n", + " main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])\n", + " decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)\n", + " dense = tf.layers.Dense(to_dict_size)\n", + " \n", + " with tf.variable_scope('decode'):\n", + " decoder_cells = attention(encoder_out, self.X_seq_len)\n", + " training_helper = tf.contrib.seq2seq.TrainingHelper(\n", + " inputs = tf.nn.embedding_lookup(decoder_embedding, decoder_input),\n", + " sequence_length = self.Y_seq_len,\n", + " time_major = False)\n", + " training_decoder = tf.contrib.seq2seq.BasicDecoder(\n", + " cell = decoder_cells,\n", + " helper = training_helper,\n", + " initial_state = decoder_cells.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state),\n", + " output_layer = dense)\n", + " training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(\n", + " decoder = training_decoder,\n", + " impute_finished = True,\n", + " maximum_iterations = tf.reduce_max(self.Y_seq_len))\n", + " self.training_logits = training_decoder_output.rnn_output\n", + " \n", + " with tf.variable_scope('decode', reuse=True):\n", + " predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(\n", + " embedding = decoder_embedding,\n", + " start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),\n", + " end_token = EOS)\n", + " predicting_decoder = tf.contrib.seq2seq.BasicDecoder(\n", + " cell = decoder_cells,\n", + " helper = predicting_helper,\n", + " initial_state = decoder_cells.zero_state(batch_size, 
tf.float32).clone(cell_state=encoder_state),\n", + " output_layer = dense)\n", + " predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(\n", + " decoder = predicting_decoder,\n", + " impute_finished = True,\n", + " maximum_iterations = 2 * tf.reduce_max(self.X_seq_len))\n", + " self.greedy = predicting_decoder_output.sample_id\n", + " self.greedy = tf.identity(self.greedy,name='greedy')\n", + " \n", + " with tf.variable_scope('decode', reuse=True):\n", + " \n", + " encoder_out_tiled = tf.contrib.seq2seq.tile_batch(encoder_out, beam_width)\n", + " encoder_state_tiled = tf.contrib.seq2seq.tile_batch(encoder_state, beam_width)\n", + " X_seq_len_tiled = tf.contrib.seq2seq.tile_batch(self.X_seq_len, beam_width)\n", + " decoder_cell = attention(encoder_out_tiled, X_seq_len_tiled, reuse=True)\n", + " \n", + " predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(\n", + " cell = decoder_cell,\n", + " embedding = decoder_embedding,\n", + " start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),\n", + " end_token = EOS,\n", + " initial_state = decoder_cell.zero_state(batch_size * beam_width, tf.float32).clone(\n", + " cell_state = encoder_state_tiled),\n", + " beam_width = beam_width,\n", + " output_layer = dense,\n", + " length_penalty_weight = 0.0)\n", + " \n", + " predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(\n", + " decoder = predicting_decoder,\n", + " impute_finished = False,\n", + " maximum_iterations = tf.reduce_max(self.X_seq_len))\n", + " \n", + " self.beam = predicting_decoder_output.predicted_ids[:, :, 0]\n", + " self.beam = tf.identity(self.beam,name='beam')\n", + " \n", + " masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)\n", + " self.masks = masks\n", + " self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,\n", + " targets = self.Y,\n", + " weights = masks)\n", + " self.optimizer = tf.train.AdamOptimizer(learning_rate = 
learning_rate).minimize(self.cost)\n", + " y_t = tf.argmax(self.training_logits,axis=2)\n", + " y_t = tf.cast(y_t, tf.int32)\n", + " self.prediction = tf.boolean_mask(y_t, masks)\n", + " mask_label = tf.boolean_mask(self.Y, masks)\n", + " correct_pred = tf.equal(self.prediction, mask_label)\n", + " correct_index = tf.cast(correct_pred, tf.float32)\n", + " self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7197fd33", + "metadata": {}, + "outputs": [], + "source": [ + "size_layer = 256\n", + "num_layers = 2\n", + "embedded_size = 256\n", + "learning_rate = 1e-3\n", + "batch_size = 32\n", + "epoch = 20" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "3a034214", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ubuntu/tf-nvidia/lib/python3.8/site-packages/tensorflow_core/python/client/session.py:1750: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", + " warnings.warn('An interactive session is already active. 
This can '\n" + ] + } + ], + "source": [ + "tf.reset_default_graph()\n", + "sess = tf.InteractiveSession()\n", + "model = Translator(size_layer, num_layers, embedded_size, len(right_dict), len(left_dict), learning_rate)\n", + "sess.run(tf.global_variables_initializer())" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "cfeb1357", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'lstm-jawi-rumi/model.ckpt'" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "saver = tf.train.Saver(tf.trainable_variables())\n", + "saver.save(sess, 'lstm-jawi-rumi/model.ckpt')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4cd7109c", + "metadata": {}, + "outputs": [], + "source": [ + "def pad_sentence_batch(sentence_batch, pad_int):\n", + " padded_seqs = []\n", + " seq_lens = []\n", + " max_sentence_len = max([len(sentence) for sentence in sentence_batch])\n", + " for sentence in sentence_batch:\n", + " padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))\n", + " seq_lens.append(len(sentence))\n", + " return padded_seqs, seq_lens" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "a44d95fc", + "metadata": {}, + "outputs": [], + "source": [ + "i = 0\n", + "batch_x = [[right_dict[c] for c in s] + [1] for s in train_X[i: i + 5]]\n", + "batch_y = [[left_dict[c] for c in s] + [1] for s in train_Y[i: i + 5]]\n", + "batch_x, _ = pad_sentence_batch(batch_x, 0)\n", + "batch_y, _ = pad_sentence_batch(batch_y, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e44c2c00", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5, 32)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.array(batch_y).shape" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "65f1ab08", + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/plain": [ + "(5, 26)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.array(batch_x).shape" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "67f53b4e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 750 ms, sys: 107 ms, total: 857 ms\n", + "Wall time: 288 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "[4.3786893, None]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "sess.run([model.cost, model.optimizer], feed_dict = {model.X: batch_x, model.Y: batch_y})" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "ff00eb04", + "metadata": {}, + "outputs": [], + "source": [ + "greedy, beam = sess.run([model.greedy, model.beam], feed_dict = {model.X: batch_x[:1]})" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "6ae9aaba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([[53, 53, 53, 53, 53, 53, 53, 66, 53, 53, 53, 53, 53, 53, 53, 53,\n", + " 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53,\n", + " 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53,\n", + " 53, 53, 53, 53]], dtype=int32),\n", + " array([[53, 53, 53, 53, 66, 53, 53, 53, 66, 53, 53, 53, 66, 53, 53, 53,\n", + " 53, 53, 66, 53, 53, 53, 53, 53, 53, 53]], dtype=int32))" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "greedy, beam" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "ec0d3145", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "minibatch loop: 91%|█████████▏| 14227/15566 [53:43<04:54, 4.55it/s, accuracy=0.941, cost=0.173] IOPub message rate exceeded.\n", + "The notebook server will temporarily stop sending output\n", 
+ "to the client in order to avoid crashing it.\n", + "To change this limit, set the config variable\n", + "`--NotebookApp.iopub_msg_rate_limit`.\n", + "\n", + "Current values:\n", + "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n", + "NotebookApp.rate_limit_window=3.0 (secs)\n", + "\n", + "minibatch loop: 15%|█▍ | 2291/15566 [08:45<54:14, 4.08it/s, accuracy=0.957, cost=0.134] IOPub message rate exceeded.\n", + "The notebook server will temporarily stop sending output\n", + "to the client in order to avoid crashing it.\n", + "To change this limit, set the config variable\n", + "`--NotebookApp.iopub_msg_rate_limit`.\n", + "\n", + "Current values:\n", + "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n", + "NotebookApp.rate_limit_window=3.0 (secs)\n", + "\n", + "minibatch loop: 100%|██████████| 15566/15566 [59:11<00:00, 4.38it/s, accuracy=0.967, cost=0.0894]\n", + "minibatch loop: 100%|██████████| 1730/1730 [01:52<00:00, 15.35it/s, accuracy=0.951, cost=0.145] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch 2, training avg loss 0.120298, training avg acc 0.957848\n", + "epoch 2, testing avg loss 0.115312, testing avg acc 0.959801\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "import tqdm\n", + "from sklearn.utils import shuffle\n", + "\n", + "for e in range(2):\n", + " train_X, train_Y = shuffle(train_X, train_Y)\n", + " pbar = tqdm.tqdm(\n", + " range(0, len(train_X), batch_size), desc = 'minibatch loop')\n", + " train_loss, train_acc, test_loss, test_acc = [], [], [], []\n", + " for i in pbar:\n", + " index = min(i + batch_size, len(train_X))\n", + " batch_x = [[right_dict[c] for c in s] + [1] for s in train_X[i: index]]\n", + " batch_y = [[left_dict[c] for c in s] + [1] for s in train_Y[i: index]]\n", + " batch_x, _ = pad_sentence_batch(batch_x, PAD)\n", + " batch_y, _ = pad_sentence_batch(batch_y, PAD)\n", + " feed = {model.X: batch_x,\n", + " model.Y: 
batch_y}\n", + " accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],\n", + " feed_dict = feed)\n", + " train_loss.append(loss)\n", + " train_acc.append(accuracy)\n", + " pbar.set_postfix(cost = loss, accuracy = accuracy)\n", + " \n", + " pbar = tqdm.tqdm(\n", + " range(0, len(test_X), batch_size), desc = 'minibatch loop')\n", + " for i in pbar:\n", + " index = min(i + batch_size, len(test_X))\n", + " batch_x = [[right_dict[c] for c in s] + [1] for s in test_X[i: index]]\n", + " batch_y = [[left_dict[c] for c in s] + [1] for s in test_Y[i: index]]\n", + " batch_x, _ = pad_sentence_batch(batch_x, PAD)\n", + " batch_y, _ = pad_sentence_batch(batch_y, PAD)\n", + " feed = {model.X: batch_x,\n", + " model.Y: batch_y,}\n", + " accuracy, loss = sess.run([model.accuracy,model.cost],\n", + " feed_dict = feed)\n", + "\n", + " test_loss.append(loss)\n", + " test_acc.append(accuracy)\n", + " pbar.set_postfix(cost = loss, accuracy = accuracy)\n", + " \n", + " print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,\n", + " np.mean(train_loss),np.mean(train_acc)))\n", + " print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,\n", + " np.mean(test_loss),np.mean(test_acc)))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "6ad3a562", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'lstm-bahdanau-jawi-rumi/model.ckpt'" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "saver = tf.train.Saver(tf.trainable_variables())\n", + "saver.save(sess, 'lstm-bahdanau-jawi-rumi/model.ckpt')" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "15f222d2", + "metadata": {}, + "outputs": [], + "source": [ + "string = 'هيدوڤ اين ساڠت له اينده ١٢٣'\n", + "batch = [right_dict[c] for c in string] + [1]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "b8aa7b02", + "metadata": {}, + "outputs": [], + "source": [ + "greedy, 
beam = sess.run([model.greedy, model.beam], feed_dict = {model.X: [batch]})" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "18a45fc1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([[60, 61, 56, 73, 68, 4, 61, 66, 61, 4, 71, 53, 66, 59, 53, 72,\n", + " 4, 64, 53, 60, 4, 61, 66, 56, 53, 60, 4, 15, 16, 17, 1]],\n", + " dtype=int32),\n", + " array([[60, 61, 56, 73, 68, 4, 61, 66, 61, 4, 71, 53, 66, 59, 53, 72,\n", + " 4, 64, 53, 60, 4, 61, 66, 56, 53, 60, 4, 15]], dtype=int32))" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "greedy, beam" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "64fdbd7d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'hidup ini sangat lah indah 123'" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "''.join([rev_left_dict[i] for i in greedy[0] if i > 3])" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "3f836ac6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /tmp/ipykernel_417540/3818740975.py:4: The name tf.get_default_graph is deprecated. 
Please use tf.compat.v1.get_default_graph instead.\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "['Placeholder',\n", + " 'Placeholder_1',\n", + " 'Variable',\n", + " 'Variable_1',\n", + " 'rnn/multi_rnn_cell/cell_0/lstm_cell/kernel',\n", + " 'rnn/multi_rnn_cell/cell_0/lstm_cell/bias',\n", + " 'rnn/multi_rnn_cell/cell_1/lstm_cell/kernel',\n", + " 'rnn/multi_rnn_cell/cell_1/lstm_cell/bias',\n", + " 'decode/memory_layer/kernel',\n", + " 'decode/decoder/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/kernel',\n", + " 'decode/decoder/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/bias',\n", + " 'decode/decoder/attention_wrapper/multi_rnn_cell/cell_1/lstm_cell/kernel',\n", + " 'decode/decoder/attention_wrapper/multi_rnn_cell/cell_1/lstm_cell/bias',\n", + " 'decode/decoder/attention_wrapper/bahdanau_attention/query_layer/kernel',\n", + " 'decode/decoder/attention_wrapper/bahdanau_attention/attention_v',\n", + " 'decode/decoder/attention_wrapper/attention_layer/kernel',\n", + " 'decode/decoder/dense/kernel',\n", + " 'decode/decoder/dense/bias',\n", + " 'decode_1/greedy',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/beam_width',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/range/start',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/range/delta',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/range',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/mul/y',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/mul',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/ExpandDims/dim',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/ExpandDims',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/add',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/Reshape/shape',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/Reshape',\n", + " 
'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/Reshape_1/shape',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/Reshape_1',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/GatherV2/axis',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/GatherV2',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/Shape',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/strided_slice/stack',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/strided_slice/stack_1',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/strided_slice/stack_2',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/strided_slice',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/output',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_word_ids/y',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_word_ids',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_parent_ids',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_finished',\n", + " 'decode_2/beam']" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "strings = ','.join(\n", + " [\n", + " n.name\n", + " for n in tf.get_default_graph().as_graph_def().node\n", + " if ('Variable' in n.op\n", + " or 'Placeholder' in n.name\n", + " or 'greedy' in n.name\n", + " or 'beam' in n.name\n", + " or 'alphas' in n.name)\n", + " and 'Adam' not in n.name\n", + " and 'beta' not in n.name\n", + " and 'OptimizeLoss' not in n.name\n", + " and 'Global_Step' not in n.name\n", + " ]\n", + ")\n", + "strings.split(',')" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "41334eee", + "metadata": {}, + "outputs": [], + "source": [ + "def freeze_graph(model_dir, output_node_names):\n", + "\n", + " if not tf.gfile.Exists(model_dir):\n", + " raise 
AssertionError(\n", + " \"Export directory doesn't exists. Please specify an export \"\n", + " \"directory: %s\" % model_dir)\n", + "\n", + " checkpoint = tf.train.get_checkpoint_state(model_dir)\n", + " input_checkpoint = checkpoint.model_checkpoint_path\n", + " \n", + " absolute_model_dir = \"/\".join(input_checkpoint.split('/')[:-1])\n", + " output_graph = absolute_model_dir + \"/frozen_model.pb\"\n", + " clear_devices = True\n", + " with tf.Session(graph=tf.Graph()) as sess:\n", + " saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)\n", + " saver.restore(sess, input_checkpoint)\n", + " output_graph_def = tf.graph_util.convert_variables_to_constants(\n", + " sess,\n", + " tf.get_default_graph().as_graph_def(),\n", + " output_node_names.split(\",\")\n", + " ) \n", + " with tf.gfile.GFile(output_graph, \"wb\") as f:\n", + " f.write(output_graph_def.SerializeToString())\n", + " print(\"%d ops in the final graph.\" % len(output_graph_def.node))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "9fda0e24", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /tmp/ipykernel_417540/1070649395.py:3: The name tf.gfile.Exists is deprecated. Please use tf.io.gfile.exists instead.\n", + "\n", + "WARNING:tensorflow:From /tmp/ipykernel_417540/1070649395.py:14: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.\n", + "\n", + "WARNING:tensorflow:From /tmp/ipykernel_417540/1070649395.py:15: The name tf.train.import_meta_graph is deprecated. Please use tf.compat.v1.train.import_meta_graph instead.\n", + "\n", + "INFO:tensorflow:Restoring parameters from lstm-bahdanau-jawi-rumi/model.ckpt\n", + "INFO:tensorflow:Froze 16 variables.\n", + "INFO:tensorflow:Converted 16 variables to const ops.\n", + "WARNING:tensorflow:From /tmp/ipykernel_417540/1070649395.py:22: The name tf.gfile.GFile is deprecated. 
Please use tf.io.gfile.GFile instead.\n", + "\n", + "1649 ops in the final graph.\n" + ] + } + ], + "source": [ + "freeze_graph(\"lstm-bahdanau-jawi-rumi\", strings)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "8d4adfc7", + "metadata": {}, + "outputs": [], + "source": [ + "def load_graph(frozen_graph_filename):\n", + " with tf.gfile.GFile(frozen_graph_filename, \"rb\") as f:\n", + " graph_def = tf.GraphDef()\n", + " graph_def.ParseFromString(f.read())\n", + " with tf.Graph().as_default() as graph:\n", + " tf.import_graph_def(graph_def)\n", + " return graph" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "9f937cb1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /tmp/ipykernel_417540/3576390908.py:3: The name tf.GraphDef is deprecated. Please use tf.compat.v1.GraphDef instead.\n", + "\n" + ] + } + ], + "source": [ + "g=load_graph('lstm-bahdanau-jawi-rumi/frozen_model.pb')" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "997bd77d", + "metadata": {}, + "outputs": [], + "source": [ + "x = g.get_tensor_by_name('import/Placeholder:0')\n", + "i_greedy = g.get_tensor_by_name('import/decode_1/greedy:0')\n", + "i_beam = g.get_tensor_by_name('import/decode_2/beam:0')" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "22a981d2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ubuntu/tf-nvidia/lib/python3.8/site-packages/tensorflow_core/python/client/session.py:1750: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", + " warnings.warn('An interactive session is already active. 
This can '\n" + ] + } + ], + "source": [ + "test_sess = tf.InteractiveSession(graph=g)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "967a8820", + "metadata": {}, + "outputs": [], + "source": [ + "greedy, beam = test_sess.run([i_greedy, i_beam], feed_dict = {x: [batch]})" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "7a3d54c3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'hidup ini sangat lah indah 123'" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "''.join([rev_left_dict[i] for i in greedy[0] if i > 3])" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "08e7ca99", + "metadata": {}, + "outputs": [], + "source": [ + "from tensorflow.tools.graph_transforms import TransformGraph\n", + "from tensorflow.contrib.seq2seq.python.ops import beam_search_ops" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "48c7848d", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-06-01 21:22:34.211193: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying add_default_attributes\n", + "2022-06-01 21:22:34.217009: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying remove_nodes\n", + "2022-06-01 21:22:34.223354: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for decode_1/greedy\n", + "2022-06-01 21:22:34.224264: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for decode_2/beam\n", + "2022-06-01 21:22:34.232276: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for decode_1/greedy\n", + "2022-06-01 21:22:34.233015: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for decode_2/beam\n", + "2022-06-01 21:22:34.240001: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for 
decode_1/greedy\n", + "2022-06-01 21:22:34.240684: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for decode_2/beam\n", + "2022-06-01 21:22:34.261770: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying fold_batch_norms\n", + "2022-06-01 21:22:34.271178: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying fold_old_batch_norms\n", + "2022-06-01 21:22:34.296578: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying quantize_weights\n", + "2022-06-01 21:22:34.337517: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying strip_unused_nodes\n", + "2022-06-01 21:22:34.344108: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying sort_by_execution_order\n" + ] + } + ], + "source": [ + "transforms = ['add_default_attributes',\n", + " 'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n", + " 'fold_batch_norms',\n", + " 'fold_old_batch_norms',\n", + " 'quantize_weights(fallback_min=-10, fallback_max=10)',\n", + " 'strip_unused_nodes',\n", + " 'sort_by_execution_order']\n", + "\n", + "pb = 'lstm-bahdanau-jawi-rumi/frozen_model.pb'\n", + "input_graph_def = tf.GraphDef()\n", + "with tf.gfile.FastGFile(pb, 'rb') as f:\n", + " input_graph_def.ParseFromString(f.read())\n", + "\n", + "transformed_graph_def = TransformGraph(input_graph_def, \n", + " ['Placeholder'],\n", + " ['decode_1/greedy', 'decode_2/beam'], transforms)\n", + "\n", + "with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n", + " f.write(transformed_graph_def.SerializeToString())" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "261c7832", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "lstm-bahdanau-jawi-rumi/\r\n", + "lstm-bahdanau-jawi-rumi/checkpoint\r\n", + "lstm-bahdanau-jawi-rumi/frozen_model.pb.quantized\r\n", + "lstm-bahdanau-jawi-rumi/model.ckpt.index\r\n", + "lstm-bahdanau-jawi-rumi/model.ckpt.data-00000-of-00001\r\n", 
+ "lstm-bahdanau-jawi-rumi/model.ckpt.meta\r\n", + "lstm-bahdanau-jawi-rumi/frozen_model.pb\r\n" + ] + } + ], + "source": [ + "!tar -cvf lstm-bahdanau-jawi-rumi.tar lstm-bahdanau-jawi-rumi" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "28ace921", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ubuntu/tf-nvidia/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from malaya_boilerplate.huggingface import upload_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "8c6f9ccd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ubuntu/tf-nvidia/lib/python3.8/site-packages/huggingface_hub/hf_api.py:79: FutureWarning: `name` and `organization` input arguments are deprecated and will be removed in v0.8. 
# Optional speed-up for character-level distances:
# !~/tf-nvidia/bin/pip3 install python-Levenshtein


def _edit_distance(ref, hyp):
    """Return the Levenshtein distance between two sequences (pure Python)."""
    # The distance is symmetric; iterate over the longer one so each DP row
    # is as short as possible.
    if len(ref) < len(hyp):
        ref, hyp = hyp, ref
    previous = list(range(len(hyp) + 1))
    for i, ref_item in enumerate(ref, 1):
        current = [i]
        for j, hyp_item in enumerate(hyp, 1):
            current.append(min(
                previous[j] + 1,                            # deletion
                current[j - 1] + 1,                         # insertion
                previous[j - 1] + (ref_item != hyp_item),   # substitution / match
            ))
        previous = current
    return previous[-1]


def _string_distance(a, b):
    """Edit distance between two strings, using `python-Levenshtein` if installed."""
    try:
        import Levenshtein as Lev
    except ImportError:
        return _edit_distance(a, b)
    return Lev.distance(a, b)


def calculate_cer(actual, hyp):
    """
    Calculate the character error rate (CER) of `hyp` against `actual`.

    Spaces are removed from both strings before comparison. The result is
    the edit distance divided by the number of reference characters.

    Parameters
    ----------
    actual : str
        Reference text.
    hyp : str
        Hypothesis (predicted) text.

    Returns
    -------
    float
        CER. For an empty reference the ratio is undefined, so this returns
        0.0 when the hypothesis is also empty and 1.0 otherwise.
    """
    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')
    if not actual:
        return 0.0 if not hyp else 1.0
    return _string_distance(actual, hyp) / len(actual)


def calculate_wer(actual, hyp):
    """
    Calculate the word error rate (WER) of `hyp` against `actual`.

    Both strings are split on whitespace and compared word by word. The
    result is the edit distance divided by the number of reference words.

    Parameters
    ----------
    actual : str
        Reference text.
    hyp : str
        Hypothesis (predicted) text.

    Returns
    -------
    float
        WER. For a reference with no words this returns 0.0 when the
        hypothesis has no words either and 1.0 otherwise.
    """
    ref_words = actual.split()
    hyp_words = hyp.split()
    if not ref_words:
        return 0.0 if not hyp_words else 1.0
    # Compare the word lists directly rather than encoding each word as a
    # single character for a string-only distance function.
    return _edit_distance(ref_words, hyp_words) / len(ref_words)


def decode(ids):
    """Turn a sequence of token ids back into text via `rev_left_dict`.

    Ids 0-3 are skipped; they are presumably the PAD/EOS/UNK/GO special
    tokens (TODO confirm against the vocabulary defined earlier in the
    notebook, which is also where `rev_left_dict` is expected to come from).
    """
    return ''.join(rev_left_dict[i] for i in ids if i > 3)
+ "outputs": [], + "source": [ + "i = 0\n", + "index = 10\n", + "batch_x = [[right_dict[c] for c in s] + [1] for s in test_X[i: index]]\n", + "batch_y = [[left_dict[c] for c in s] + [1] for s in test_Y[i: index]]\n", + "batch_x, _ = pad_sentence_batch(batch_x, PAD)\n", + "batch_y, _ = pad_sentence_batch(batch_y, PAD)\n", + "feed = {model.X: batch_x,\n", + " model.Y: batch_y,}\n", + "greedy, beam = sess.run([model.greedy, model.beam], feed_dict = feed)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "5e6d4d6c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.3383333333333333, 0.08239819004524887)" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wer, cer = [], []\n", + "for k in range(len(batch_x)):\n", + " d_left = decode(batch_y[k])\n", + " d_right = decode(greedy[k])\n", + " wer.append(calculate_wer(d_left, d_right))\n", + " cer.append(calculate_cer(d_left, d_right))\n", + " \n", + "np.mean(wer), np.mean(cer)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "2eaa6aa4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1730/1730 [08:33<00:00, 3.37it/s]\n" + ] + } + ], + "source": [ + "wer, cer = [], []\n", + "for i in tqdm.tqdm(range(0, len(test_X), batch_size)):\n", + " index = min(i + batch_size, len(test_X))\n", + " batch_x = [[right_dict[c] for c in s] + [1] for s in test_X[i: index]]\n", + " batch_y = [[left_dict[c] for c in s] + [1] for s in test_Y[i: index]]\n", + " batch_x, _ = pad_sentence_batch(batch_x, PAD)\n", + " batch_y, _ = pad_sentence_batch(batch_y, PAD)\n", + " feed = {model.X: batch_x,\n", + " model.Y: batch_y,}\n", + " greedy, beam = sess.run([model.greedy, model.beam], feed_dict = feed)\n", + " for k in range(len(batch_x)):\n", + " d_left = decode(batch_y[k])\n", + " d_right = decode(greedy[k])\n", + " wer.append(calculate_wer(d_left, d_right))\n", + 
" cer.append(calculate_cer(d_left, d_right))" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "6d53f833", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.33811816744187656, 0.09239719040982326)" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(wer), np.mean(cer)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tf1", + "language": "python", + "name": "tf1" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/session/rumi-jawi/lstm-bahdanau.ipynb b/session/rumi-jawi/lstm-bahdanau.ipynb new file mode 100644 index 00000000..060c9fb9 --- /dev/null +++ b/session/rumi-jawi/lstm-bahdanau.ipynb @@ -0,0 +1,1398 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a3844223", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ['CUDA_VISIBLE_DEVICES'] = ''" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "99fff94b", + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import numpy as np\n", + "import json\n", + "from glob import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "53dc44e6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0, 1, 2, 3]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "PAD = 0\n", + "EOS = 1\n", + "UNK = 2\n", + "GO = 3\n", + "[PAD, EOS, UNK, GO]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "98c4fe31", + "metadata": {}, + "outputs": [], + "source": [ + "left_dict = {0: 0,\n", + " 1: 1,\n", + " 2: 2,\n", + " 3: 3,\n", + " 
' ': 4,\n", + " '!': 5,\n", + " '\"': 6,\n", + " \"'\": 7,\n", + " '(': 8,\n", + " ')': 9,\n", + " '+': 10,\n", + " ',': 11,\n", + " '-': 12,\n", + " '.': 13,\n", + " '0': 14,\n", + " '1': 15,\n", + " '2': 16,\n", + " '3': 17,\n", + " '4': 18,\n", + " '5': 19,\n", + " '6': 20,\n", + " '7': 21,\n", + " '8': 22,\n", + " '9': 23,\n", + " ':': 24,\n", + " ';': 25,\n", + " '?': 26,\n", + " 'A': 27,\n", + " 'B': 28,\n", + " 'C': 29,\n", + " 'D': 30,\n", + " 'E': 31,\n", + " 'F': 32,\n", + " 'G': 33,\n", + " 'H': 34,\n", + " 'I': 35,\n", + " 'J': 36,\n", + " 'K': 37,\n", + " 'L': 38,\n", + " 'M': 39,\n", + " 'N': 40,\n", + " 'O': 41,\n", + " 'P': 42,\n", + " 'Q': 43,\n", + " 'R': 44,\n", + " 'S': 45,\n", + " 'T': 46,\n", + " 'U': 47,\n", + " 'V': 48,\n", + " 'W': 49,\n", + " 'X': 50,\n", + " 'Y': 51,\n", + " 'Z': 52,\n", + " 'a': 53,\n", + " 'b': 54,\n", + " 'c': 55,\n", + " 'd': 56,\n", + " 'e': 57,\n", + " 'f': 58,\n", + " 'g': 59,\n", + " 'h': 60,\n", + " 'i': 61,\n", + " 'j': 62,\n", + " 'k': 63,\n", + " 'l': 64,\n", + " 'm': 65,\n", + " 'n': 66,\n", + " 'o': 67,\n", + " 'p': 68,\n", + " 'q': 69,\n", + " 'r': 70,\n", + " 's': 71,\n", + " 't': 72,\n", + " 'u': 73,\n", + " 'v': 74,\n", + " 'w': 75,\n", + " 'x': 76,\n", + " 'y': 77,\n", + " 'z': 78}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8cb15445", + "metadata": {}, + "outputs": [], + "source": [ + "right_dict = {0: 0,\n", + " 1: 1,\n", + " 2: 2,\n", + " 3: 3,\n", + " ' ': 4,\n", + " '!': 5,\n", + " '\"': 6,\n", + " '-': 7,\n", + " '.': 8,\n", + " ':': 9,\n", + " ';': 10,\n", + " '،': 11,\n", + " '؟': 12,\n", + " 'ء': 13,\n", + " 'آ': 14,\n", + " 'أ': 15,\n", + " 'ؤ': 16,\n", + " 'إ': 17,\n", + " 'ئ': 18,\n", + " 'ا': 19,\n", + " 'ب': 20,\n", + " 'ة': 21,\n", + " 'ت': 22,\n", + " 'ث': 23,\n", + " 'ج': 24,\n", + " 'ح': 25,\n", + " 'خ': 26,\n", + " 'د': 27,\n", + " 'ذ': 28,\n", + " 'ر': 29,\n", + " 'ز': 30,\n", + " 'س': 31,\n", + " 'ش': 32,\n", + " 'ص': 33,\n", + " 'ض': 34,\n", + " 'ط': 
class Translator:
    """Character-level LSTM encoder-decoder with Bahdanau attention (TF1 graph mode).

    Building an instance adds the whole model to the current default graph and
    exposes these tensors/ops as attributes:

    - ``X``, ``Y``: int32 placeholders of shape ``[None, None]`` (source / target ids).
    - ``X_seq_len``, ``Y_seq_len``: per-row count of non-zero ids in ``X`` / ``Y``.
    - ``training_logits``: decoder logits under teacher forcing.
    - ``greedy``: greedy-decoded ids, exposed through an identity op named ``greedy``.
    - ``beam``: ids of beam index 0 from beam search, identity op named ``beam``.
    - ``masks``, ``cost``, ``optimizer``, ``prediction``, ``accuracy``: training
      loss (masked sequence loss), Adam update op and masked token accuracy.

    Parameters
    ----------
    size_layer : int
        Units per LSTM cell; also used as attention ``num_units`` and
        ``attention_layer_size``.
    num_layers : int
        Number of stacked LSTM cells in both encoder and decoder.
    embedded_size : int
        Width of the encoder and decoder embedding tables.
    from_dict_size : int
        Number of rows in the encoder embedding table.
    to_dict_size : int
        Number of rows in the decoder embedding table and size of the output
        projection.
    learning_rate : float
        Learning rate passed to ``AdamOptimizer``.
    beam_width : int, default 10
        Beam width for the beam-search decoder.

    Notes
    -----
    The module-level constants ``GO`` and ``EOS`` are read while building the graph.
    """

    def __init__(self, size_layer, num_layers, embedded_size,
                 from_dict_size, to_dict_size, learning_rate, beam_width = 10):
        
        # Factory for a single LSTM layer with orthogonal weight initialisation.
        def cells(reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size_layer,initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        # Builds the decoder cell: a stack of `num_layers` LSTM cells wrapped
        # with Bahdanau attention over `encoder_out`.
        def attention(encoder_out, seq_len, reuse=False):
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units = size_layer, 
                                                                    memory = encoder_out,
                                                                    memory_sequence_length = seq_len)
            return tf.contrib.seq2seq.AttentionWrapper(
            cell = tf.nn.rnn_cell.MultiRNNCell([cells(reuse) for _ in range(num_layers)]), 
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layer)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        # Lengths are the number of non-zero ids per row, so id 0 is treated as
        # padding; a 0 appearing inside a sequence would shorten its counted length.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]
        
        encoder_embedding = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
        decoder_embedding = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))
        
        encoder_out, encoder_state = tf.nn.dynamic_rnn(
            cell = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)]), 
            inputs = tf.nn.embedding_lookup(encoder_embedding, self.X),
            sequence_length = self.X_seq_len,
            dtype = tf.float32)
        # Teacher-forcing input: drop the last column of Y and prepend GO.
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        # One output projection object shared by all three decoders below.
        dense = tf.layers.Dense(to_dict_size)
        
        # --- Training decoder (creates the variables under scope 'decode') ---
        with tf.variable_scope('decode'):
            decoder_cells = attention(encoder_out, self.X_seq_len)
            training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs = tf.nn.embedding_lookup(decoder_embedding, decoder_input),
                sequence_length = self.Y_seq_len,
                time_major = False)
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cells,
                helper = training_helper,
                # Decoder starts from the encoder's final state.
                initial_state = decoder_cells.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state),
                output_layer = dense)
            training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
            self.training_logits = training_decoder_output.rnn_output
        
        # --- Greedy decoder (re-enters 'decode' with reuse=True) ---
        with tf.variable_scope('decode', reuse=True):
            predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding = decoder_embedding,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS)
            predicting_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cells,
                helper = predicting_helper,
                initial_state = decoder_cells.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state),
                output_layer = dense)
            predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = True,
                # Greedy output may be up to twice the longest input in the batch.
                maximum_iterations = 2 * tf.reduce_max(self.X_seq_len))
            self.greedy = predicting_decoder_output.sample_id
            # Identity op gives the output a fixed op name; the node listing
            # captured later in this notebook shows it as 'decode_1/greedy'.
            self.greedy = tf.identity(self.greedy,name='greedy')
        
        # --- Beam-search decoder (re-enters 'decode' with reuse=True) ---
        with tf.variable_scope('decode', reuse=True):
            
            # Beam search runs on batch_size * beam_width rows, so encoder
            # outputs, state and lengths are tiled beam_width times.
            encoder_out_tiled = tf.contrib.seq2seq.tile_batch(encoder_out, beam_width)
            encoder_state_tiled = tf.contrib.seq2seq.tile_batch(encoder_state, beam_width)
            X_seq_len_tiled = tf.contrib.seq2seq.tile_batch(self.X_seq_len, beam_width)
            decoder_cell = attention(encoder_out_tiled, X_seq_len_tiled, reuse=True)
            
            predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell = decoder_cell,
                embedding = decoder_embedding,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS,
                initial_state = decoder_cell.zero_state(batch_size * beam_width, tf.float32).clone(
                    cell_state = encoder_state_tiled),
                beam_width = beam_width,
                output_layer = dense,
                length_penalty_weight = 0.0)
            
            predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = False,
                # NOTE(review): capped at 1x the longest input, whereas the greedy
                # decoder above allows 2x. Outputs longer than the input would be
                # truncated here only -- confirm whether this is intentional.
                maximum_iterations = tf.reduce_max(self.X_seq_len))
            
            # Keep only beam index 0 (presumably the best-scoring beam -- TODO confirm).
            self.beam = predicting_decoder_output.predicted_ids[:, :, 0]
            # Listed as 'decode_2/beam' in the node listing captured later in this notebook.
            self.beam = tf.identity(self.beam,name='beam')
        
        # --- Loss, optimiser and masked token accuracy ---
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.masks = masks
        # Logits span max(Y_seq_len) steps while targets use Y as fed; this assumes
        # Y is padded exactly to its longest sequence -- TODO confirm at call sites.
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        # NOTE(review): `masks` is float32 here, not bool -- verify tf.boolean_mask
        # accepts it on the TensorFlow version in use.
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        # NOTE(review): `correct_index` is never read afterwards.
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
Please use tf.compat.v1.reset_default_graph instead.\n", + "\n", + "WARNING:tensorflow:From /tmp/ipykernel_369390/221865152.py:2: The name tf.InteractiveSession is deprecated. Please use tf.compat.v1.InteractiveSession instead.\n", + "\n", + "WARNING:tensorflow:From /tmp/ipykernel_369390/2944119294.py:17: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n", + "\n", + "WARNING:tensorflow:From /tmp/ipykernel_369390/2944119294.py:23: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.\n", + "\n", + "WARNING:tensorflow:From /tmp/ipykernel_369390/2944119294.py:33: The name tf.layers.Dense is deprecated. Please use tf.compat.v1.layers.Dense instead.\n", + "\n", + "WARNING:tensorflow:From /tmp/ipykernel_369390/2944119294.py:35: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.\n", + "\n", + "WARNING:tensorflow:\n", + "The TensorFlow contrib module will not be included in TensorFlow 2.0.\n", + "For more information, please see:\n", + " * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n", + " * https://github.com/tensorflow/addons\n", + " * https://github.com/tensorflow/io (for I/O related ops)\n", + "If you depend on functionality not listed there, please file an issue.\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-06-01 10:54:13.304553: I tensorflow/core/platform/profile_utils/cpu_utils.cc:109] CPU Frequency: 2496000000 Hz\n", + "2022-06-01 10:54:13.304949: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x3b5d1c0 initialized for platform Host (this does not guarantee that XLA will be used). 
Devices:\n", + "2022-06-01 10:54:13.304962: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n", + "2022-06-01 10:54:13.306018: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1\n", + "2022-06-01 10:54:13.308229: E tensorflow/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n", + "2022-06-01 10:54:13.308244: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: huseincomel-desktop\n", + "2022-06-01 10:54:13.308248: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: huseincomel-desktop\n", + "2022-06-01 10:54:13.308280: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.129.6\n", + "2022-06-01 10:54:13.308298: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.129.6\n", + "2022-06-01 10:54:13.308303: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.129.6\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /tmp/ipykernel_369390/2944119294.py:100: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.\n", + "\n", + "WARNING:tensorflow:From /tmp/ipykernel_369390/221865152.py:4: The name tf.global_variables_initializer is deprecated. 
def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sequence in a batch to the length of the longest one.

    Parameters
    ----------
    sentence_batch : sequence of sequences
        Batch of id sequences (lists or tuples). May be empty.
    pad_int : int
        Value appended to shorter sequences.

    Returns
    -------
    (list of list, list of int)
        The padded sequences (always new lists; the inputs are not modified)
        and the original, unpadded length of each sequence. An empty batch
        yields ``([], [])``.
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    # default=0 keeps an empty batch from raising ValueError in max().
    max_sentence_len = max(seq_lens, default=0)
    padded_seqs = [
        # list(...) lets tuples and other sequences be padded as well.
        list(sentence) + [pad_int] * (max_sentence_len - length)
        for sentence, length in zip(sentence_batch, seq_lens)
    ]
    return padded_seqs, seq_lens
= pad_sentence_batch(batch_y, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "e44c2c00", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5, 20)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.array(batch_y).shape" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "65f1ab08", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5, 29)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.array(batch_x).shape" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "67f53b4e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 496 ms, sys: 7.86 ms, total: 504 ms\n", + "Wall time: 55.8 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "[4.0693455, None]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "sess.run([model.cost, model.optimizer], feed_dict = {model.X: batch_x, model.Y: batch_y})" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "ff00eb04", + "metadata": {}, + "outputs": [], + "source": [ + "greedy, beam = sess.run([model.greedy, model.beam], feed_dict = {model.X: batch_x[:1]})" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "6ae9aaba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([[44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,\n", + " 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,\n", + " 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,\n", + " 44, 44, 44, 44, 44, 44, 44, 44, 44, 44]], dtype=int32),\n", + " array([[46, 44, 44, 44, 44, 44, 44, 4, 44, 44, 44, 44, 44, 44, 4, 44,\n", + " 44, 44, 44, 44, 4, 44, 44, 44, 44, 4, 44, 44, 44]], dtype=int32))" + ] + }, + 
"execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "greedy, beam" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "ec0d3145", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "minibatch loop: 31%|███ | 4768/15566 [17:19<35:09, 5.12it/s, accuracy=1, cost=0.00731] IOPub message rate exceeded.\n", + "The notebook server will temporarily stop sending output\n", + "to the client in order to avoid crashing it.\n", + "To change this limit, set the config variable\n", + "`--NotebookApp.iopub_msg_rate_limit`.\n", + "\n", + "Current values:\n", + "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n", + "NotebookApp.rate_limit_window=3.0 (secs)\n", + "\n", + "minibatch loop: 65%|██████▍ | 10087/15566 [36:43<20:23, 4.48it/s, accuracy=0.998, cost=0.00692]IOPub message rate exceeded.\n", + "The notebook server will temporarily stop sending output\n", + "to the client in order to avoid crashing it.\n", + "To change this limit, set the config variable\n", + "`--NotebookApp.iopub_msg_rate_limit`.\n", + "\n", + "Current values:\n", + "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n", + "NotebookApp.rate_limit_window=3.0 (secs)\n", + "\n", + "minibatch loop: 94%|█████████▍| 14691/15566 [53:33<03:08, 4.63it/s, accuracy=0.993, cost=0.0192] IOPub message rate exceeded.\n", + "The notebook server will temporarily stop sending output\n", + "to the client in order to avoid crashing it.\n", + "To change this limit, set the config variable\n", + "`--NotebookApp.iopub_msg_rate_limit`.\n", + "\n", + "Current values:\n", + "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n", + "NotebookApp.rate_limit_window=3.0 (secs)\n", + "\n", + "minibatch loop: 100%|██████████| 15566/15566 [56:57<00:00, 4.56it/s, accuracy=0.981, cost=0.0478] \n", + "minibatch loop: 100%|██████████| 1730/1730 [01:48<00:00, 15.93it/s, accuracy=0.98, cost=0.0507] " + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "epoch 2, training avg loss 0.027398, training avg acc 0.992040\n", + "epoch 2, testing avg loss 0.026345, testing avg acc 0.992578\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "import tqdm\n", + "from sklearn.utils import shuffle\n", + "\n", + "for e in range(2):\n", + " train_X, train_Y = shuffle(train_X, train_Y)\n", + " pbar = tqdm.tqdm(\n", + " range(0, len(train_X), batch_size), desc = 'minibatch loop')\n", + " train_loss, train_acc, test_loss, test_acc = [], [], [], []\n", + " for i in pbar:\n", + " index = min(i + batch_size, len(train_X))\n", + " batch_x = [[left_dict[c] for c in s] + [1] for s in train_X[i: index]]\n", + " batch_y = [[right_dict[c] for c in s] + [1] for s in train_Y[i: index]]\n", + " batch_x, _ = pad_sentence_batch(batch_x, PAD)\n", + " batch_y, _ = pad_sentence_batch(batch_y, PAD)\n", + " feed = {model.X: batch_x,\n", + " model.Y: batch_y}\n", + " accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],\n", + " feed_dict = feed)\n", + " train_loss.append(loss)\n", + " train_acc.append(accuracy)\n", + " pbar.set_postfix(cost = loss, accuracy = accuracy)\n", + " \n", + " pbar = tqdm.tqdm(\n", + " range(0, len(test_X), batch_size), desc = 'minibatch loop')\n", + " for i in pbar:\n", + " index = min(i + batch_size, len(test_X))\n", + " batch_x = [[left_dict[c] for c in s] + [1] for s in test_X[i: index]]\n", + " batch_y = [[right_dict[c] for c in s] + [1] for s in test_Y[i: index]]\n", + " batch_x, _ = pad_sentence_batch(batch_x, PAD)\n", + " batch_y, _ = pad_sentence_batch(batch_y, PAD)\n", + " feed = {model.X: batch_x,\n", + " model.Y: batch_y,}\n", + " accuracy, loss = sess.run([model.accuracy,model.cost],\n", + " feed_dict = feed)\n", + "\n", + " test_loss.append(loss)\n", + " test_acc.append(accuracy)\n", + " pbar.set_postfix(cost = loss, accuracy = accuracy)\n", + " \n", + " print('epoch %d, training avg loss %f, 
training avg acc %f'%(e+1,\n", + " np.mean(train_loss),np.mean(train_acc)))\n", + " print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,\n", + " np.mean(test_loss),np.mean(test_acc)))" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "6ad3a562", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'lstm-bahdanau-rumi-jawi/model.ckpt'" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "saver = tf.train.Saver(tf.trainable_variables())\n", + "saver.save(sess, 'lstm-bahdanau-rumi-jawi/model.ckpt')" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "15f222d2", + "metadata": {}, + "outputs": [], + "source": [ + "string = 'comel'\n", + "batch = [left_dict[c] for c in string] + [1]" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "b8aa7b02", + "metadata": {}, + "outputs": [], + "source": [ + "greedy, beam = sess.run([model.greedy, model.beam], feed_dict = {model.X: [batch]})" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "18a45fc1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([[61, 46, 43, 48, 42, 1]], dtype=int32),\n", + " array([[61, 46, 43, 48, 42, 1]], dtype=int32))" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "greedy, beam" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "64fdbd7d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'چوميل'" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "''.join([rev_right_dict[i] for i in greedy[0] if i > 3])" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "3f836ac6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /tmp/ipykernel_369390/3818740975.py:4: The name 
tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "['Placeholder',\n", + " 'Placeholder_1',\n", + " 'Variable',\n", + " 'Variable_1',\n", + " 'rnn/multi_rnn_cell/cell_0/lstm_cell/kernel',\n", + " 'rnn/multi_rnn_cell/cell_0/lstm_cell/bias',\n", + " 'rnn/multi_rnn_cell/cell_1/lstm_cell/kernel',\n", + " 'rnn/multi_rnn_cell/cell_1/lstm_cell/bias',\n", + " 'decode/memory_layer/kernel',\n", + " 'decode/decoder/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/kernel',\n", + " 'decode/decoder/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/bias',\n", + " 'decode/decoder/attention_wrapper/multi_rnn_cell/cell_1/lstm_cell/kernel',\n", + " 'decode/decoder/attention_wrapper/multi_rnn_cell/cell_1/lstm_cell/bias',\n", + " 'decode/decoder/attention_wrapper/bahdanau_attention/query_layer/kernel',\n", + " 'decode/decoder/attention_wrapper/bahdanau_attention/attention_v',\n", + " 'decode/decoder/attention_wrapper/attention_layer/kernel',\n", + " 'decode/decoder/dense/kernel',\n", + " 'decode/decoder/dense/bias',\n", + " 'decode_1/greedy',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/beam_width',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/range/start',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/range/delta',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/range',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/mul/y',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/mul',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/ExpandDims/dim',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/ExpandDims',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/add',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/Reshape/shape',\n", + " 
'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/Reshape',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/Reshape_1/shape',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/Reshape_1',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/GatherV2/axis',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/GatherV2',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/Shape',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/strided_slice/stack',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/strided_slice/stack_1',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/strided_slice/stack_2',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/strided_slice',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_probs/output',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_word_ids/y',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_word_ids',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_parent_ids',\n", + " 'decode_2/decoder/while/BeamSearchDecoderStep/next_beam_finished',\n", + " 'decode_2/beam']" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "strings = ','.join(\n", + " [\n", + " n.name\n", + " for n in tf.get_default_graph().as_graph_def().node\n", + " if ('Variable' in n.op\n", + " or 'Placeholder' in n.name\n", + " or 'greedy' in n.name\n", + " or 'beam' in n.name\n", + " or 'alphas' in n.name)\n", + " and 'Adam' not in n.name\n", + " and 'beta' not in n.name\n", + " and 'OptimizeLoss' not in n.name\n", + " and 'Global_Step' not in n.name\n", + " ]\n", + ")\n", + "strings.split(',')" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "41334eee", + "metadata": {}, + "outputs": [], + "source": [ + "def freeze_graph(model_dir, 
output_node_names):\n", + "\n", + " if not tf.gfile.Exists(model_dir):\n", + " raise AssertionError(\n", + " \"Export directory doesn't exists. Please specify an export \"\n", + " \"directory: %s\" % model_dir)\n", + "\n", + " checkpoint = tf.train.get_checkpoint_state(model_dir)\n", + " input_checkpoint = checkpoint.model_checkpoint_path\n", + " \n", + " absolute_model_dir = \"/\".join(input_checkpoint.split('/')[:-1])\n", + " output_graph = absolute_model_dir + \"/frozen_model.pb\"\n", + " clear_devices = True\n", + " with tf.Session(graph=tf.Graph()) as sess:\n", + " saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)\n", + " saver.restore(sess, input_checkpoint)\n", + " output_graph_def = tf.graph_util.convert_variables_to_constants(\n", + " sess,\n", + " tf.get_default_graph().as_graph_def(),\n", + " output_node_names.split(\",\")\n", + " ) \n", + " with tf.gfile.GFile(output_graph, \"wb\") as f:\n", + " f.write(output_graph_def.SerializeToString())\n", + " print(\"%d ops in the final graph.\" % len(output_graph_def.node))" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "9fda0e24", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /tmp/ipykernel_369390/1070649395.py:3: The name tf.gfile.Exists is deprecated. Please use tf.io.gfile.exists instead.\n", + "\n", + "WARNING:tensorflow:From /tmp/ipykernel_369390/1070649395.py:14: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.\n", + "\n", + "WARNING:tensorflow:From /tmp/ipykernel_369390/1070649395.py:15: The name tf.train.import_meta_graph is deprecated. 
Please use tf.compat.v1.train.import_meta_graph instead.\n", + "\n", + "INFO:tensorflow:Restoring parameters from lstm-bahdanau-rumi-jawi/model.ckpt\n", + "INFO:tensorflow:Froze 16 variables.\n", + "INFO:tensorflow:Converted 16 variables to const ops.\n", + "WARNING:tensorflow:From /tmp/ipykernel_369390/1070649395.py:22: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.\n", + "\n", + "1649 ops in the final graph.\n" + ] + } + ], + "source": [ + "freeze_graph(\"lstm-bahdanau-rumi-jawi\", strings)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "8d4adfc7", + "metadata": {}, + "outputs": [], + "source": [ + "def load_graph(frozen_graph_filename):\n", + " with tf.gfile.GFile(frozen_graph_filename, \"rb\") as f:\n", + " graph_def = tf.GraphDef()\n", + " graph_def.ParseFromString(f.read())\n", + " with tf.Graph().as_default() as graph:\n", + " tf.import_graph_def(graph_def)\n", + " return graph" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "9f937cb1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /tmp/ipykernel_369390/3576390908.py:3: The name tf.GraphDef is deprecated. 
Please use tf.compat.v1.GraphDef instead.\n", + "\n" + ] + } + ], + "source": [ + "g=load_graph('lstm-bahdanau-rumi-jawi/frozen_model.pb')" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "997bd77d", + "metadata": {}, + "outputs": [], + "source": [ + "x = g.get_tensor_by_name('import/Placeholder:0')\n", + "i_greedy = g.get_tensor_by_name('import/decode_1/greedy:0')\n", + "i_beam = g.get_tensor_by_name('import/decode_2/beam:0')" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "22a981d2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ubuntu/tf-nvidia/lib/python3.8/site-packages/tensorflow_core/python/client/session.py:1750: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", + " warnings.warn('An interactive session is already active. 
This can '\n" + ] + } + ], + "source": [ + "test_sess = tf.InteractiveSession(graph=g)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "967a8820", + "metadata": {}, + "outputs": [], + "source": [ + "greedy, beam = test_sess.run([i_greedy, i_beam], feed_dict = {x: [batch]})" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "7a3d54c3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'چوميل'" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "''.join([rev_right_dict[i] for i in greedy[0] if i > 3])" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "08e7ca99", + "metadata": {}, + "outputs": [], + "source": [ + "from tensorflow.tools.graph_transforms import TransformGraph\n", + "from tensorflow.contrib.seq2seq.python.ops import beam_search_ops" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "48c7848d", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-06-01 15:34:36.865453: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying add_default_attributes\n", + "2022-06-01 15:34:36.870972: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying remove_nodes\n", + "2022-06-01 15:34:36.876625: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for decode_1/greedy\n", + "2022-06-01 15:34:36.877407: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for decode_2/beam\n", + "2022-06-01 15:34:36.884702: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for decode_1/greedy\n", + "2022-06-01 15:34:36.885274: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for decode_2/beam\n", + "2022-06-01 15:34:36.891508: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for decode_1/greedy\n", + 
"2022-06-01 15:34:36.892066: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for decode_2/beam\n", + "2022-06-01 15:34:36.911687: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying fold_batch_norms\n", + "2022-06-01 15:34:36.920762: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying fold_old_batch_norms\n", + "2022-06-01 15:34:36.945159: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying quantize_weights\n", + "2022-06-01 15:34:36.983036: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying strip_unused_nodes\n", + "2022-06-01 15:34:36.989182: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying sort_by_execution_order\n" + ] + } + ], + "source": [ + "transforms = ['add_default_attributes',\n", + " 'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n", + " 'fold_batch_norms',\n", + " 'fold_old_batch_norms',\n", + " 'quantize_weights(fallback_min=-10, fallback_max=10)',\n", + " 'strip_unused_nodes',\n", + " 'sort_by_execution_order']\n", + "\n", + "pb = 'lstm-bahdanau-rumi-jawi/frozen_model.pb'\n", + "input_graph_def = tf.GraphDef()\n", + "with tf.gfile.FastGFile(pb, 'rb') as f:\n", + " input_graph_def.ParseFromString(f.read())\n", + "\n", + "transformed_graph_def = TransformGraph(input_graph_def, \n", + " ['Placeholder'],\n", + " ['decode_1/greedy', 'decode_2/beam'], transforms)\n", + "\n", + "with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n", + " f.write(transformed_graph_def.SerializeToString())" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "261c7832", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "lstm-bahdanau-rumi-jawi/\r\n", + "lstm-bahdanau-rumi-jawi/checkpoint\r\n", + "lstm-bahdanau-rumi-jawi/frozen_model.pb.quantized\r\n", + "lstm-bahdanau-rumi-jawi/model.ckpt.index\r\n", + "lstm-bahdanau-rumi-jawi/model.ckpt.data-00000-of-00001\r\n", + 
"lstm-bahdanau-rumi-jawi/model.ckpt.meta\r\n", + "lstm-bahdanau-rumi-jawi/frozen_model.pb\r\n" + ] + } + ], + "source": [ + "!tar -cvf lstm-bahdanau-rumi-jawi.tar lstm-bahdanau-rumi-jawi" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "28ace921", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ubuntu/tf-nvidia/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from malaya_boilerplate.huggingface import upload_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "8c6f9ccd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ubuntu/tf-nvidia/lib/python3.8/site-packages/huggingface_hub/hf_api.py:79: FutureWarning: `name` and `organization` input arguments are deprecated and will be removed in v0.8. 
Pass `repo_id` instead.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "files_mapping = {'lstm-bahdanau-rumi-jawi/frozen_model.pb': 'model.pb'}\n", + "upload_dict(model = 'rumi-jawi-lstm-bahdanau', files_mapping = files_mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "1cbd614c", + "metadata": {}, + "outputs": [], + "source": [ + "files_mapping = {'lstm-bahdanau-rumi-jawi/frozen_model.pb.quantized': 'model.pb'}\n", + "upload_dict(model = 'rumi-jawi-lstm-bahdanau-quantized', files_mapping = files_mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "f7bcbd6f", + "metadata": {}, + "outputs": [], + "source": [ + "files_mapping = {'lstm-bahdanau-rumi-jawi.tar': 'lstm-bahdanau-rumi-jawi.tar'}\n", + "upload_dict(model = 'pretrained-rumi-jawi', files_mapping = files_mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "6010c13c", + "metadata": {}, + "outputs": [], + "source": [ + "# !~/tf-nvidia/bin/pip3 install python-Levenshtein\n", + "\n", + "def calculate_cer(actual, hyp):\n", + " \"\"\"\n", + " Calculate CER using `python-Levenshtein`.\n", + " \"\"\"\n", + " import Levenshtein as Lev\n", + "\n", + " actual = actual.replace(' ', '')\n", + " hyp = hyp.replace(' ', '')\n", + " return Lev.distance(actual, hyp) / len(actual)\n", + "\n", + "\n", + "def calculate_wer(actual, hyp):\n", + " \"\"\"\n", + " Calculate WER using `python-Levenshtein`.\n", + " \"\"\"\n", + " import Levenshtein as Lev\n", + "\n", + " b = set(actual.split() + hyp.split())\n", + " word2char = dict(zip(b, range(len(b))))\n", + "\n", + " w1 = [chr(word2char[w]) for w in actual.split()]\n", + " w2 = [chr(word2char[w]) for w in hyp.split()]\n", + "\n", + " return Lev.distance(''.join(w1), ''.join(w2)) / len(actual.split())\n", + "\n", + "def decode(ids):\n", + " return ''.join([rev_right_dict[i] for i in ids if i > 3])" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "101d8a59", + "metadata": {}, 
+ "outputs": [], + "source": [ + "i = 0\n", + "index = 10\n", + "batch_x = [[left_dict[c] for c in s] + [1] for s in test_X[i: index]]\n", + "batch_y = [[right_dict[c] for c in s] + [1] for s in test_Y[i: index]]\n", + "batch_x, _ = pad_sentence_batch(batch_x, PAD)\n", + "batch_y, _ = pad_sentence_batch(batch_y, PAD)\n", + "feed = {model.X: batch_x,\n", + " model.Y: batch_y,}\n", + "greedy, beam = sess.run([model.greedy, model.beam], feed_dict = feed)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "5e6d4d6c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.03333333333333333, 0.006666666666666666)" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wer, cer = [], []\n", + "for k in range(len(batch_x)):\n", + " d_left = decode(batch_y[k])\n", + " d_right = decode(greedy[k])\n", + " wer.append(calculate_wer(d_left, d_right))\n", + " cer.append(calculate_cer(d_left, d_right))\n", + " \n", + "np.mean(wer), np.mean(cer)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "2eaa6aa4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1730/1730 [09:41<00:00, 2.98it/s]\n" + ] + } + ], + "source": [ + "wer, cer = [], []\n", + "for i in tqdm.tqdm(range(0, len(test_X), batch_size)):\n", + " index = min(i + batch_size, len(test_X))\n", + " batch_x = [[left_dict[c] for c in s] + [1] for s in test_X[i: index]]\n", + " batch_y = [[right_dict[c] for c in s] + [1] for s in test_Y[i: index]]\n", + " batch_x, _ = pad_sentence_batch(batch_x, PAD)\n", + " batch_y, _ = pad_sentence_batch(batch_y, PAD)\n", + " feed = {model.X: batch_x,\n", + " model.Y: batch_y,}\n", + " greedy, beam = sess.run([model.greedy, model.beam], feed_dict = feed)\n", + " for k in range(len(batch_x)):\n", + " d_left = decode(batch_y[k])\n", + " d_right = decode(greedy[k])\n", + " wer.append(calculate_wer(d_left, d_right))\n", 
+ " cer.append(calculate_cer(d_left, d_right))" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "6d53f833", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.06737832963079593, 0.014847105998349451)" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(wer), np.mean(cer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51e327ee", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tf1", + "language": "python", + "name": "tf1" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/session/rumi-jawi/prepare-t2t-dev.ipynb b/session/rumi-jawi/prepare-t2t-dev.ipynb new file mode 100644 index 00000000..9cb7eb2f --- /dev/null +++ b/session/rumi-jawi/prepare-t2t-dev.ipynb @@ -0,0 +1,537 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a3844223", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ['CUDA_VISIBLE_DEVICES'] = ''" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fade1c44", + "metadata": {}, + "outputs": [], + "source": [ + "left_dict = {\n", + " 0: 0,\n", + " 1: 1,\n", + " 2: 2,\n", + " 3: 3,\n", + " ' ': 4,\n", + " '!': 5,\n", + " '\"': 6,\n", + " \"'\": 7,\n", + " '(': 8,\n", + " ')': 9,\n", + " '+': 10,\n", + " ',': 11,\n", + " '-': 12,\n", + " '.': 13,\n", + " '0': 14,\n", + " '1': 15,\n", + " '2': 16,\n", + " '3': 17,\n", + " '4': 18,\n", + " '5': 19,\n", + " '6': 20,\n", + " '7': 21,\n", + " '8': 22,\n", + " '9': 23,\n", + " ':': 24,\n", + " ';': 25,\n", + " '?': 26,\n", + " 'A': 27,\n", + " 'B': 28,\n", + " 'C': 29,\n", + " 
'D': 30,\n", + " 'E': 31,\n", + " 'F': 32,\n", + " 'G': 33,\n", + " 'H': 34,\n", + " 'I': 35,\n", + " 'J': 36,\n", + " 'K': 37,\n", + " 'L': 38,\n", + " 'M': 39,\n", + " 'N': 40,\n", + " 'O': 41,\n", + " 'P': 42,\n", + " 'Q': 43,\n", + " 'R': 44,\n", + " 'S': 45,\n", + " 'T': 46,\n", + " 'U': 47,\n", + " 'V': 48,\n", + " 'W': 49,\n", + " 'X': 50,\n", + " 'Y': 51,\n", + " 'Z': 52,\n", + " 'a': 53,\n", + " 'b': 54,\n", + " 'c': 55,\n", + " 'd': 56,\n", + " 'e': 57,\n", + " 'f': 58,\n", + " 'g': 59,\n", + " 'h': 60,\n", + " 'i': 61,\n", + " 'j': 62,\n", + " 'k': 63,\n", + " 'l': 64,\n", + " 'm': 65,\n", + " 'n': 66,\n", + " 'o': 67,\n", + " 'p': 68,\n", + " 'q': 69,\n", + " 'r': 70,\n", + " 's': 71,\n", + " 't': 72,\n", + " 'u': 73,\n", + " 'v': 74,\n", + " 'w': 75,\n", + " 'x': 76,\n", + " 'y': 77,\n", + " 'z': 78,\n", + " '،': 79,\n", + " '؟': 80,\n", + " 'ء': 81,\n", + " 'آ': 82,\n", + " 'أ': 83,\n", + " 'ؤ': 84,\n", + " 'إ': 85,\n", + " 'ئ': 86,\n", + " 'ا': 87,\n", + " 'ب': 88,\n", + " 'ة': 89,\n", + " 'ت': 90,\n", + " 'ث': 91,\n", + " 'ج': 92,\n", + " 'ح': 93,\n", + " 'خ': 94,\n", + " 'د': 95,\n", + " 'ذ': 96,\n", + " 'ر': 97,\n", + " 'ز': 98,\n", + " 'س': 99,\n", + " 'ش': 100,\n", + " 'ص': 101,\n", + " 'ض': 102,\n", + " 'ط': 103,\n", + " 'ظ': 104,\n", + " 'ع': 105,\n", + " 'غ': 106,\n", + " 'ف': 107,\n", + " 'ق': 108,\n", + " 'ك': 109,\n", + " 'ل': 110,\n", + " 'م': 111,\n", + " 'ن': 112,\n", + " 'ه': 113,\n", + " 'و': 114,\n", + " 'ى': 115,\n", + " 'ي': 116,\n", + " 'ّ': 117,\n", + " 'ٓ': 118,\n", + " '٠': 119,\n", + " '١': 120,\n", + " '٢': 121,\n", + " '٣': 122,\n", + " '٤': 123,\n", + " '٥': 124,\n", + " '٦': 125,\n", + " '٧': 126,\n", + " '٨': 127,\n", + " '٩': 128,\n", + " 'چ': 129,\n", + " 'ڠ': 130,\n", + " 'ڤ': 131,\n", + " 'ڬ': 132,\n", + " 'ڽ': 133,\n", + " 'ۏ': 134,\n", + " '﴾': 135,\n", + " '﴿': 136\n", + "}\n", + "rev_left_dict = {v: k for k, v in left_dict.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f90c5331", + 
"metadata": {}, + "outputs": [], + "source": [ + "class Encoder:\n", + " def __init__(self, dict):\n", + " self.dict = dict\n", + " self.vocab_size = len(self.dict)\n", + "\n", + " def encode(self, s):\n", + " s = [left_dict[c] for c in s] + [1]\n", + " return s\n", + " \n", + " def decode(self, ids):\n", + " return ''.join([rev_left_dict[i] for i in ids if i > 3])\n", + "\n", + "\n", + "encoder = Encoder(left_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "328fdfd0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs\n", + "Wall time: 3.81 µs\n" + ] + }, + { + "data": { + "text/plain": [ + "[71, 53, 77, 53, 1]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "encoder.encode('saya')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "26507817", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'saya'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "encoder.decode([71, 53, 77, 53, 1])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ef7266b3", + "metadata": {}, + "outputs": [], + "source": [ + "from tensor2tensor.data_generators import problem\n", + "from tensor2tensor.data_generators import text_problems\n", + "from tensor2tensor.utils import registry\n", + "from tqdm import tqdm\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6cf25fc8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "137" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(left_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e3b5d097", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['كاوسن 
كڤولاوان سڤراتلي يڠ',\n", + " 'ڤرليمين ڤرسكوتوان اونتوق',\n", + " 'ڤنوبوهن تامن سينر',\n", + " 'ڤريڠكت كمنترين، كاتڽ.',\n", + " 'تله مندرم سباڽق تيڬ',\n", + " 'هاري اين،',\n", + " 'برتوليرنسي\"',\n", + " 'مڠيسهكان',\n", + " 'سوڠ-قواڠ',\n", + " 'سيبير دڠن باجو بياسا.']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with open('jawi-set.json') as fopen:\n", + " jawi = json.load(fopen)\n", + "jawi['train'][:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ebb932f5", + "metadata": {}, + "outputs": [], + "source": [ + "with open('rumi-set.json') as fopen:\n", + " rumi = json.load(fopen)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "76cc612d", + "metadata": {}, + "outputs": [], + "source": [ + "train_X = rumi['train']\n", + "train_Y = jawi['train']\n", + "\n", + "test_X = rumi['test']\n", + "test_Y = jawi['test']" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1e4fa939", + "metadata": {}, + "outputs": [], + "source": [ + "from tensor2tensor.data_generators import problem\n", + "from tensor2tensor.data_generators import text_problems\n", + "from tensor2tensor.utils import registry\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5fe362b2", + "metadata": {}, + "outputs": [], + "source": [ + "@registry.register_problem\n", + "class Jawi(text_problems.Text2TextProblem):\n", + " @property\n", + " def approx_vocab_size(self):\n", + " return len(left_dict)\n", + "\n", + " @property\n", + " def is_generate_per_split(self):\n", + " # generate_data will shard the data into TRAIN and EVAL for us.\n", + " return False\n", + "\n", + " @property\n", + " def dataset_splits(self):\n", + " return [\n", + " {'split': problem.DatasetSplit.EVAL, 'shards': 1},\n", + " ]\n", + "\n", + " def generate_samples(self, data_dir, tmp_dir, dataset_split):\n", + "\n", + " for i in tqdm(range(len(test_X))):\n", + " 
l = encoder.encode(test_X[i])\n", + " r = encoder.encode(test_Y[i])\n", + " yield {'inputs': l, 'targets': r}\n", + "\n", + " def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):\n", + "\n", + " generator = self.generate_samples(data_dir, tmp_dir, dataset_split)\n", + " for sample in generator:\n", + " yield sample" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a0ca7c48", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import tensorflow as tf" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "e4746e6e", + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = os.path.expanduser('t2t-rumi-jawi/data')\n", + "TMP_DIR = os.path.expanduser('t2t-rumi-jawi/tmp')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "17a80072", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /tmp/ipykernel_379351/420477998.py:1: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ubuntu/tf-nvidia/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + " 0%| | 0/55346 [00:00<?, ?it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generating case 0.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generating case 0.\n", + "100%|██████████| 55346/55346 [00:01<00:00, 43998.08it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generated 55346 Examples\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "INFO:tensorflow:Generated 55346 Examples\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Shuffling data...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Shuffling data...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Data shuffled.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Data shuffled.\n" + ] + } + ], + "source": [ + "tf.gfile.MakeDirs(DATA_DIR)\n", + "tf.gfile.MakeDirs(TMP_DIR)\n", + "\n", + "from tensor2tensor.utils import registry\n", + "from tensor2tensor import problems\n", + "\n", + "PROBLEM = 'jawi'\n", + "t2t_problem = problems.problem(PROBLEM)\n", + "t2t_problem.generate_data(DATA_DIR, TMP_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f773d115", + "metadata": {}, + "outputs": [], + "source": [ + "# tf.train.list_variables('t2t-phoneme/train-small/model.ckpt-1100')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e8a4ad9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tf1", + "language": "python", + "name": "tf1" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/session/rumi-jawi/prepare-t2t-train.ipynb b/session/rumi-jawi/prepare-t2t-train.ipynb new file mode 100644 index 00000000..8b27d512 --- /dev/null +++ b/session/rumi-jawi/prepare-t2t-train.ipynb @@ -0,0 +1,693 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a3844223", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ['CUDA_VISIBLE_DEVICES'] = ''" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "fade1c44", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "files = ['news-windows.json', 'wikipedia-1word.json', 'wikipedia-windows.json']\n", + "left, right = [], []\n", + "for f in files:\n", + " with open(f) as fopen:\n", + " data = json.load(fopen)\n", + " for d in data:\n", + " if len(d) == 2:\n", + " if len(d[0]) and len(d[1]):\n", + " left.append(d[0])\n", + " right.append(d[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "32da9c19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0, 1, 2, 3]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "PAD = 0\n", + "EOS = 1\n", + "UNK = 2\n", + "GO = 3\n", + "[PAD, EOS, UNK, GO]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "5ed78d87", + "metadata": {}, + "outputs": [], + "source": [ + "left_dict = [PAD, EOS, UNK, GO] + sorted(set(list(''.join(left))) | set(list(''.join(right))))\n", + "left_dict = {c: no for no, c in enumerate(left_dict)}\n", + "rev_left_dict = {v: k for k, v in left_dict.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "2f273fdc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0,\n", + 
" 1: 1,\n", + " 2: 2,\n", + " 3: 3,\n", + " ' ': 4,\n", + " '!': 5,\n", + " '\"': 6,\n", + " \"'\": 7,\n", + " '(': 8,\n", + " ')': 9,\n", + " '+': 10,\n", + " ',': 11,\n", + " '-': 12,\n", + " '.': 13,\n", + " '0': 14,\n", + " '1': 15,\n", + " '2': 16,\n", + " '3': 17,\n", + " '4': 18,\n", + " '5': 19,\n", + " '6': 20,\n", + " '7': 21,\n", + " '8': 22,\n", + " '9': 23,\n", + " ':': 24,\n", + " ';': 25,\n", + " '?': 26,\n", + " 'A': 27,\n", + " 'B': 28,\n", + " 'C': 29,\n", + " 'D': 30,\n", + " 'E': 31,\n", + " 'F': 32,\n", + " 'G': 33,\n", + " 'H': 34,\n", + " 'I': 35,\n", + " 'J': 36,\n", + " 'K': 37,\n", + " 'L': 38,\n", + " 'M': 39,\n", + " 'N': 40,\n", + " 'O': 41,\n", + " 'P': 42,\n", + " 'Q': 43,\n", + " 'R': 44,\n", + " 'S': 45,\n", + " 'T': 46,\n", + " 'U': 47,\n", + " 'V': 48,\n", + " 'W': 49,\n", + " 'X': 50,\n", + " 'Y': 51,\n", + " 'Z': 52,\n", + " 'a': 53,\n", + " 'b': 54,\n", + " 'c': 55,\n", + " 'd': 56,\n", + " 'e': 57,\n", + " 'f': 58,\n", + " 'g': 59,\n", + " 'h': 60,\n", + " 'i': 61,\n", + " 'j': 62,\n", + " 'k': 63,\n", + " 'l': 64,\n", + " 'm': 65,\n", + " 'n': 66,\n", + " 'o': 67,\n", + " 'p': 68,\n", + " 'q': 69,\n", + " 'r': 70,\n", + " 's': 71,\n", + " 't': 72,\n", + " 'u': 73,\n", + " 'v': 74,\n", + " 'w': 75,\n", + " 'x': 76,\n", + " 'y': 77,\n", + " 'z': 78,\n", + " '،': 79,\n", + " '؟': 80,\n", + " 'ء': 81,\n", + " 'آ': 82,\n", + " 'أ': 83,\n", + " 'ؤ': 84,\n", + " 'إ': 85,\n", + " 'ئ': 86,\n", + " 'ا': 87,\n", + " 'ب': 88,\n", + " 'ة': 89,\n", + " 'ت': 90,\n", + " 'ث': 91,\n", + " 'ج': 92,\n", + " 'ح': 93,\n", + " 'خ': 94,\n", + " 'د': 95,\n", + " 'ذ': 96,\n", + " 'ر': 97,\n", + " 'ز': 98,\n", + " 'س': 99,\n", + " 'ش': 100,\n", + " 'ص': 101,\n", + " 'ض': 102,\n", + " 'ط': 103,\n", + " 'ظ': 104,\n", + " 'ع': 105,\n", + " 'غ': 106,\n", + " 'ف': 107,\n", + " 'ق': 108,\n", + " 'ك': 109,\n", + " 'ل': 110,\n", + " 'م': 111,\n", + " 'ن': 112,\n", + " 'ه': 113,\n", + " 'و': 114,\n", + " 'ى': 115,\n", + " 'ي': 116,\n", + " 'ّ': 117,\n", + " 
class Encoder:
    """Character-level encoder backed by a ``{symbol: id}`` vocabulary.

    Ids 0-3 are reserved for the special tokens (PAD=0, EOS=1, UNK=2, GO=3);
    every other entry maps a single character to its integer id.

    The encoder is self-contained: it only consults the vocabulary handed to
    the constructor, never a module-level dictionary, so several encoders with
    different vocabularies can coexist.
    """

    EOS_ID = 1  # appended to every encoded sequence
    UNK_ID = 2  # substituted for characters missing from the vocabulary
    NUM_RESERVED = 4  # ids below this are special tokens and never decoded

    def __init__(self, dict):
        """Store the vocabulary and pre-compute its inverse.

        Args:
            dict: mapping of symbol -> integer id. The parameter name shadows
                the builtin but is kept so existing callers keep working.
        """
        self.dict = dict
        self.vocab_size = len(self.dict)
        # Inverse lookup used by `decode`; built once instead of per call.
        self.rev_dict = {index: symbol for symbol, index in self.dict.items()}

    def encode(self, s):
        """Return the ids of the characters of ``s`` followed by EOS.

        Characters that are not in the vocabulary are mapped to UNK instead
        of raising ``KeyError``, so unseen input does not abort encoding.
        """
        lookup = self.dict.get
        unk = self.UNK_ID
        return [lookup(char, unk) for char in s] + [self.EOS_ID]

    def decode(self, ids):
        """Return the text for ``ids``, dropping the reserved special ids."""
        return ''.join(
            self.rev_dict[i] for i in ids if i >= self.NUM_RESERVED
        )
Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-31 15:00:58.005135: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n" + ] + } + ], + "source": [ + "from tensor2tensor.data_generators import problem\n", + "from tensor2tensor.data_generators import text_problems\n", + "from tensor2tensor.utils import registry\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "6cf25fc8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "137" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(left_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "e3b5d097", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['كاوسن كڤولاوان سڤراتلي يڠ',\n", + " 'ڤرليمين ڤرسكوتوان اونتوق',\n", + " 'ڤنوبوهن تامن سينر',\n", + " 'ڤريڠكت كمنترين، كاتڽ.',\n", + " 'تله مندرم سباڽق تيڬ',\n", + " 'هاري اين،',\n", + " 'برتوليرنسي\"',\n", + " 'مڠيسهكان',\n", + " 'سوڠ-قواڠ',\n", + " 'سيبير دڠن باجو بياسا.']" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with open('jawi-set.json') as fopen:\n", + " jawi = json.load(fopen)\n", + "jawi['train'][:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "ebb932f5", + "metadata": {}, + "outputs": [], + "source": [ + "with open('rumi-set.json') as fopen:\n", + " rumi = json.load(fopen)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "76cc612d", + "metadata": {}, + "outputs": [], + "source": [ + "train_X = rumi['train']\n", + "train_Y = jawi['train']\n", + "\n", + "test_X = rumi['test']\n", + "test_Y = jawi['test']" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "1e4fa939", + "metadata": {}, + "outputs": [ + 
@registry.register_problem
class Jawi(text_problems.Text2TextProblem):
    """Rumi -> Jawi character-level translation problem for tensor2tensor.

    Samples are produced already encoded as id lists by the notebook-level
    ``encoder``, so ``generate_encoded_samples`` forwards them unchanged
    instead of running a text encoder over them.
    """

    @property
    def approx_vocab_size(self):
        # The vocabulary is the fixed character table built in this notebook.
        return len(left_dict)

    @property
    def is_generate_per_split(self):
        # A single generator feeds every declared split; it is not called
        # once per split.
        return False

    @property
    def dataset_splits(self):
        # Only a TRAIN split is declared here, so all generated samples go to
        # the 20 train shards.
        # NOTE(review): presumably the eval shard is produced by the companion
        # prepare-t2t-dev notebook — confirm.
        return [
            {'split': problem.DatasetSplit.TRAIN, 'shards': 20},
        ]

    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Yield ``{'inputs': ids, 'targets': ids}`` for each training pair.

        Raises:
            ValueError: if ``train_X`` and ``train_Y`` differ in length, which
                would otherwise misalign or truncate the parallel corpus.
        """
        if len(train_X) != len(train_Y):
            raise ValueError(
                'train_X and train_Y must have the same length, got '
                '%d and %d' % (len(train_X), len(train_Y))
            )

        # tqdm wraps the first iterable so the progress bar knows the total.
        for source, target in zip(tqdm(train_X), train_Y):
            yield {
                'inputs': encoder.encode(source),
                'targets': encoder.encode(target),
            }

    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        """Forward the samples as-is; they are already lists of ids."""
        yield from self.generate_samples(data_dir, tmp_dir, dataset_split)
"source": [ + "os.system('rm -rf t2t-rumi-jawi/data')\n", + "DATA_DIR = os.path.expanduser('t2t-rumi-jawi/data')\n", + "TMP_DIR = os.path.expanduser('t2t-rumi-jawi/tmp')" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "17a80072", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /tmp/ipykernel_378277/420477998.py:1: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ubuntu/tf-nvidia/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + " 0%| | 0/498106 [00:00<?, ?it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generating case 0.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generating case 0.\n", + " 20%|█▉ | 98205/498106 [00:02<00:08, 44725.60it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generating case 100000.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generating case 100000.\n", + " 40%|███▉ | 196924/498106 [00:04<00:06, 45045.99it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generating case 200000.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generating case 200000.\n", + " 59%|█████▉ | 295770/498106 [00:06<00:04, 44349.16it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generating case 300000.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generating case 300000.\n", + " 
80%|████████ | 398686/498106 [00:08<00:02, 44295.16it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generating case 400000.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generating case 400000.\n", + "100%|██████████| 498106/498106 [00:11<00:00, 44533.40it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Generated 498106 Examples\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "INFO:tensorflow:Generated 498106 Examples\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Shuffling data...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Shuffling data...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Data shuffled.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Data shuffled.\n" + ] + } + ], + "source": [ + "tf.gfile.MakeDirs(DATA_DIR)\n", + "tf.gfile.MakeDirs(TMP_DIR)\n", + "\n", + "from tensor2tensor.utils import registry\n", + "from tensor2tensor import problems\n", + "\n", + "PROBLEM = 'jawi'\n", + "t2t_problem = problems.problem(PROBLEM)\n", + "t2t_problem.generate_data(DATA_DIR, TMP_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f773d115", + "metadata": {}, + "outputs": [], + "source": [ + "# tf.train.list_variables('t2t-phoneme/train-small/model.ckpt-1100')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e8a4ad9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tf1", + "language": "python", + "name": "tf1" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/session/rumi-jawi/t2t_small.py b/session/rumi-jawi/t2t_small.py new file mode 100644 index 00000000..d58b1b59 --- /dev/null +++ b/session/rumi-jawi/t2t_small.py @@ -0,0 +1,240 @@ +import os + +os.environ['CUDA_VISIBLE_DEVICES'] = '' + +from tensor2tensor.data_generators import problem +from tensor2tensor.data_generators import text_problems +from tensor2tensor.data_generators import translate +from tensor2tensor.utils import registry +from tensor2tensor import problems +import tensorflow as tf +import os +import logging + +left_dict = { + 0: 0, + 1: 1, + 2: 2, + 3: 3, + ' ': 4, + '!': 5, + '"': 6, + "'": 7, + '(': 8, + ')': 9, + '+': 10, + ',': 11, + '-': 12, + '.': 13, + '0': 14, + '1': 15, + '2': 16, + '3': 17, + '4': 18, + '5': 19, + '6': 20, + '7': 21, + '8': 22, + '9': 23, + ':': 24, + ';': 25, + '?': 26, + 'A': 27, + 'B': 28, + 'C': 29, + 'D': 30, + 'E': 31, + 'F': 32, + 'G': 33, + 'H': 34, + 'I': 35, + 'J': 36, + 'K': 37, + 'L': 38, + 'M': 39, + 'N': 40, + 'O': 41, + 'P': 42, + 'Q': 43, + 'R': 44, + 'S': 45, + 'T': 46, + 'U': 47, + 'V': 48, + 'W': 49, + 'X': 50, + 'Y': 51, + 'Z': 52, + 'a': 53, + 'b': 54, + 'c': 55, + 'd': 56, + 'e': 57, + 'f': 58, + 'g': 59, + 'h': 60, + 'i': 61, + 'j': 62, + 'k': 63, + 'l': 64, + 'm': 65, + 'n': 66, + 'o': 67, + 'p': 68, + 'q': 69, + 'r': 70, + 's': 71, + 't': 72, + 'u': 73, + 'v': 74, + 'w': 75, + 'x': 76, + 'y': 77, + 'z': 78, + '،': 79, + '؟': 80, + 'ء': 81, + 'آ': 82, + 'أ': 83, + 'ؤ': 84, + 'إ': 85, + 'ئ': 86, + 'ا': 87, + 'ب': 88, + 'ة': 89, + 'ت': 90, + 'ث': 91, + 'ج': 92, + 'ح': 93, + 'خ': 94, + 'د': 95, + 'ذ': 96, + 'ر': 97, + 'ز': 98, + 'س': 99, + 'ش': 100, + 'ص': 101, + 'ض': 102, + 'ط': 103, + 'ظ': 104, + 'ع': 105, + 'غ': 106, + 'ف': 107, + 'ق': 108, + 'ك': 109, + 'ل': 110, + 'م': 111, + 'ن': 112, + 'ه': 113, + 'و': 114, + 'ى': 115, + 'ي': 116, + 'ّ': 117, + 'ٓ': 
class Encoder:
    """Character-level encoder backed by a ``{symbol: id}`` vocabulary.

    Ids 0-3 are reserved for the special tokens (PAD=0, EOS=1, UNK=2, GO=3);
    every other entry maps a single character to its integer id.

    The encoder is self-contained: it only consults the vocabulary handed to
    the constructor, never a module-level dictionary, so the instance built
    in ``feature_encoders`` really uses the table it was given.
    """

    EOS_ID = 1  # appended to every encoded sequence
    UNK_ID = 2  # substituted for characters missing from the vocabulary
    NUM_RESERVED = 4  # ids below this are special tokens and never decoded

    def __init__(self, dict):
        """Store the vocabulary and pre-compute its inverse.

        Args:
            dict: mapping of symbol -> integer id. The parameter name shadows
                the builtin but is kept so existing callers keep working.
        """
        self.dict = dict
        self.vocab_size = len(self.dict)
        # Inverse lookup used by `decode`; built once instead of per call.
        self.rev_dict = {index: symbol for symbol, index in self.dict.items()}

    def encode(self, s):
        """Return the ids of the characters of ``s`` followed by EOS.

        Characters that are not in the vocabulary are mapped to UNK instead
        of raising ``KeyError``, so unseen input does not abort encoding.
        """
        lookup = self.dict.get
        unk = self.UNK_ID
        return [lookup(char, unk) for char in s] + [self.EOS_ID]

    def decode(self, ids):
        """Return the text for ``ids``, dropping the reserved special ids."""
        return ''.join(
            self.rev_dict[i] for i in ids if i >= self.NUM_RESERVED
        )
model_dir=TRAIN_DIR, + model_name=MODEL, + save_checkpoints_steps=save_checkpoints_steps, + num_gpus=0, +) + +tensorflow_exp_fn = create_experiment( + run_config=RUN_CONFIG, + hparams=hparams, + model_name=MODEL, + problem_name=PROBLEM, + data_dir=DATA_DIR, + train_steps=train_steps, + eval_steps=eval_steps, + # use_xla=True # For acceleration +) + +tensorflow_exp_fn.train_and_evaluate() diff --git a/setup.py b/setup.py index f5945507..eb0841ad 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def readme(): setuptools.setup( name=__packagename__, packages=setuptools.find_packages(), - version='4.7.5', + version='4.8.0', python_requires='>=3.6.*', description='Natural-Language-Toolkit for bahasa Malaysia, powered by Deep Learning Tensorflow.', long_description=readme(),