From b511e1dc9bdcc5cec3c24e4448a7c7f8044a6177 Mon Sep 17 00:00:00 2001 From: huseinzol05 Date: Fri, 6 Dec 2024 20:59:20 +0800 Subject: [PATCH] initial evaluation 2.1 --- .../end-to-end/evaluate/base-2.1-en-ms.ipynb | 335 ++++++++++++++++++ .../end-to-end/evaluate/base-2.1-ind-ms.ipynb | 239 +++++++++++++ .../end-to-end/evaluate/small-2.1-en-ms.ipynb | 294 +++++++++++++++ .../evaluate/small-2.1-ind-ms.ipynb | 231 ++++++++++++ .../nanot5-base-multipack-compile.sh | 28 ++ .../nanot5-small-multipack-compile.sh | 13 +- 6 files changed, 1134 insertions(+), 6 deletions(-) create mode 100644 session/translation/end-to-end/evaluate/base-2.1-en-ms.ipynb create mode 100644 session/translation/end-to-end/evaluate/base-2.1-ind-ms.ipynb create mode 100644 session/translation/end-to-end/evaluate/small-2.1-en-ms.ipynb create mode 100644 session/translation/end-to-end/evaluate/small-2.1-ind-ms.ipynb create mode 100644 session/translation/end-to-end/nanot5-base-multipack-compile.sh diff --git a/session/translation/end-to-end/evaluate/base-2.1-en-ms.ipynb b/session/translation/end-to-end/evaluate/base-2.1-en-ms.ipynb new file mode 100644 index 00000000..35d1735d --- /dev/null +++ b/session/translation/end-to-end/evaluate/base-2.1-en-ms.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3560a64c", + "metadata": {}, + "outputs": [], + "source": [ + "# !wget https://github.com/mesolitica/malaysian-dataset/raw/master/translation/flores200-eval/bjn_Latn.dev\n", + "# !wget https://github.com/mesolitica/malaysian-dataset/raw/master/translation/flores200-eval/eng_Latn.dev\n", + "# !wget https://github.com/mesolitica/malaysian-dataset/raw/master/translation/flores200-eval/ind_Latn.dev\n", + "# !wget https://github.com/mesolitica/malaysian-dataset/raw/master/translation/flores200-eval/jav_Latn.dev\n", + "# !wget https://github.com/mesolitica/malaysian-dataset/raw/master/translation/flores200-eval/zsm_Latn.dev\n", + "# !wget https://github.com/mesolitica/malaysian-dataset/raw/master/translation/flores200-eval/zho_Hans.dev\n", + "# !wget https://github.com/mesolitica/malaysian-dataset/raw/master/translation/flores200-eval/tam_Taml.dev" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "019bd464", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ['CUDA_VISIBLE_DEVICES'] = '0'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2d713f77", + "metadata": {}, + "outputs": [], + "source": [ + "with open('eng_Latn.dev') as fopen:\n", + " en = fopen.read().split('\\n')\n", + " \n", + "with open('zsm_Latn.dev') as fopen:\n", + " ms = fopen.read().split('\\n')\n", + " \n", + "en_, ms_ = [], []\n", + "for i in range(len(en)):\n", + " if len(en[i]) and len(ms[i]):\n", + " en_.append(en[i])\n", + " ms_.append(ms[i])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "076a2a37", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(997, 997)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(en_), len(ms_)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "14d402ec", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "import requests\n", + "import os\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4795de0a", + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf base-en-ms\n", + "!mkdir base-en-ms" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "37ac8ece", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "62383ba5f6d846a2b79873571a96b13a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/21.0k [00:00