Commit

fix nanot5
huseinzol05 committed Apr 14, 2024
1 parent 14582a3 commit 3b5ce7c
Showing 8 changed files with 13,036 additions and 241 deletions.
135 changes: 135 additions & 0 deletions pretrained-model/nanoT5/hf_trainer.py
@@ -0,0 +1,135 @@
from dataclasses import dataclass, field
import transformers
from transformers import (
AutoConfig,
AutoModelForCausalLM,
T5ForConditionalGeneration,
AutoTokenizer,
HfArgumentParser,
Trainer,
TrainingArguments,
default_data_collator,
DataCollatorWithPadding,
DataCollatorForLanguageModeling,
is_torch_tpu_available,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from typing import Optional
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import torch
import logging
import os
import sys
import datasets
import numpy as np
from utils.copied_utils import (
    compute_input_and_target_lengths,
    DataCollatorForT5MLM,
    tokenize_function,
    DataCollatorForNI,
)

logger = logging.getLogger(__name__)


class UInt16(Encoding):
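    """
    Minimal `streaming` codec so MDS shards can store token ids as uint16:
    the ~32k-entry BPE vocabulary used here fits comfortably, roughly halving
    shard size compared to storing int32.
    """
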
def encode(self, obj) -> bytes:
return obj.tobytes()

def decode(self, data: bytes):
return np.frombuffer(data, np.uint16)


_encodings['uint16'] = UInt16


class DatasetFixed(torch.utils.data.Dataset):
    """
    Map-style wrapper over a local MDS dataset.

    NOTE: the original commit references `DatasetFixed` without defining or
    importing it; this minimal implementation over `streaming.LocalDataset`
    is an assumption.
    """

    def __init__(self, local):
        self.dataset = LocalDataset(local=local)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        # shards store token ids as uint16; cast to int64 for the collator/model
        return {k: np.asarray(v).astype(np.int64) for k, v in data.items()}

    def __len__(self):
        return len(self.dataset)


@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
"""

model_name_or_path: Optional[str] = field(
default=None,
        metadata={
            "help": (
                "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
            )
        },
)


@dataclass
class DataTrainingArguments:
train_file: Optional[str] = field(
default=None, metadata={
"help": "The input training data file (a text file)."})


def main():
# See all possible arguments in src/transformers/training_args.py
# or by passing the --help flag to this script.
# We now keep distinct sets of args, for a cleaner separation of concerns.

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()

# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)

if training_args.should_log:
# The default of training_args.log_level is passive, so we set log level
# at info here to have that default.
transformers.utils.logging.set_verbosity_info()

log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

last_checkpoint = None
if os.path.isdir(
training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)

tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
config = AutoConfig.from_pretrained(model_args.model_name_or_path)
model = T5ForConditionalGeneration.from_pretrained(model_args.model_name_or_path)

before_mask_input_length, target_length = compute_input_and_target_lengths(
inputs_length=512,
noise_density=0.15,
mean_noise_span_length=3.0,
)
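    # compute_input_and_target_lengths returns (a) the raw, pre-masking sequence
    # length that packs back to exactly 512 tokens once masked spans are replaced
    # by sentinels, and (b) the matching decoder target length used by the collator.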

data_collator = DataCollatorForT5MLM(
tokenizer=tokenizer,
noise_density=0.15,
mean_noise_span_length=3.0,
input_length=512,
target_length=target_length,
pad_token_id=config.pad_token_id,
)
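    # DataCollatorForT5MLM applies span corruption on the fly: random spans
    # covering ~15% of each 512-token example (mean span length 3) are replaced
    # with sentinel tokens, and the dropped spans become the decoder labels.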
dataset = DatasetFixed(data_args.train_file)

trainer = Trainer(
model=model,
args=training_args,
train_dataset=dataset,
eval_dataset=None,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=None,
preprocess_logits_for_metrics=None,
)

checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint)
    trainer.save_model()


if __name__ == '__main__':
    main()
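
A minimal launch sketch (not part of the commit): hf_trainer.py only defines --model_name_or_path and --train_file itself and hands everything else to HuggingFace's TrainingArguments, so, assuming /home/ubuntu/mosaic-nanot5-512 is a local MDS directory written with the uint16 codec above, a single-GPU run could look like:

python hf_trainer.py \
    --model_name_or_path ./out-small-1.1 \
    --train_file /home/ubuntu/mosaic-nanot5-512 \
    --output_dir ./hf-nanot5-small \
    --do_train \
    --per_device_train_batch_size 32 \
    --learning_rate 2e-4 \
    --bf16 \
    --save_steps 1000 \
    --max_steps 65536
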
215 changes: 204 additions & 11 deletions pretrained-model/nanoT5/prepare-tokenizer-base-model.ipynb

Large diffs are not rendered by default.

210 changes: 141 additions & 69 deletions pretrained-model/nanoT5/prepare-tokenizer-large-model.ipynb

Large diffs are not rendered by default.

53 changes: 25 additions & 28 deletions pretrained-model/nanoT5/prepare-tokenizer-small-model.ipynb
@@ -4,7 +4,15 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
]
}
],
"source": [
"from transformers import GPT2Tokenizer\n",
"from transformers import (\n",
@@ -14,8 +22,7 @@
")\n",
"from tokenizers import AddedToken\n",
"\n",
"tokenizer = GPT2Tokenizer('/home/husein/dev/malay-dataset/prepare-llm/32k-vocab.json',\n",
" '/home/husein/dev/malay-dataset/prepare-llm/32k-merges.txt')\n",
"tokenizer = AutoTokenizer.from_pretrained('malaysia-ai/bpe-tokenizer')\n",
"tokenizer_t5 = AutoTokenizer.from_pretrained('google/t5-v1_1-base')\n",
"additional = []\n",
"for t in tokenizer_t5.additional_special_tokens:\n",
@@ -26,20 +33,18 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('./out-small-1.1/tokenizer_config.json',\n",
" './out-small-1.1/special_tokens_map.json',\n",
" './out-small-1.1/vocab.json',\n",
" './out-small-1.1/merges.txt',\n",
" './out-small-1.1/added_tokens.json')"
" './out-small-1.1/tokenizer.json')"
]
},
"execution_count": 7,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -50,7 +55,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -62,7 +67,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -74,7 +79,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -85,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -94,34 +99,26 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total 343M\r\n",
"-rw-r--r-- 1 husein husein 2.6K Jul 4 12:48 added_tokens.json\r\n",
"-rw-r--r-- 1 husein husein 760 Jul 4 12:49 config.json\r\n",
"-rw-r--r-- 1 husein husein 297K Jul 4 12:48 merges.txt\r\n",
"-rw-r--r-- 1 husein husein 342M Jul 4 12:49 pytorch_model.bin\r\n",
"-rw-r--r-- 1 husein husein 15K Jul 4 12:48 special_tokens_map.json\r\n",
"-rw-r--r-- 1 husein husein 714 Jul 4 12:48 tokenizer_config.json\r\n",
"-rw-r--r-- 1 husein husein 631K Jul 4 12:48 vocab.json\r\n"
"-rw-r--r-- 1 ubuntu ubuntu 789 Apr 14 02:28 config.json\r\n",
"-rw-r--r-- 1 ubuntu ubuntu 142 Apr 14 02:28 generation_config.json\r\n",
"-rw-r--r-- 1 ubuntu ubuntu 342M Apr 14 02:28 model.safetensors\r\n",
"-rw-r--r-- 1 ubuntu ubuntu 15K Apr 14 02:28 special_tokens_map.json\r\n",
"-rw-r--r-- 1 ubuntu ubuntu 1.3M Apr 14 02:28 tokenizer.json\r\n",
"-rw-r--r-- 1 ubuntu ubuntu 21K Apr 14 02:28 tokenizer_config.json\r\n"
]
}
],
"source": [
"!ls -lh out-small-1.1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -140,7 +137,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.12"
}
},
"nbformat": 4,
9 changes: 4 additions & 5 deletions pretrained-model/nanoT5/run-base.sh
@@ -1,16 +1,15 @@
rm -rf /dev/shm/*
WANDB_PROJECT=nanoT5-base \
~/.local/bin/torchrun --nproc_per_node 4 \
-m train \
model.name="/home/ubuntu/malaya/pretrained-model/nanoT5/out-base-1.1" \
model.random_init=false \
data.filename.train="/home/ubuntu/nanot5-512" \
data.filename.test="/home/ubuntu/nanot5-512" \
data.filename.train="/home/ubuntu/mosaic-nanot5-512" \
data.filename.test="/home/ubuntu/mosaic-nanot5-512" \
data.input_length=512 \
checkpoint.every_steps=1000 \
optim.total_steps=65536 \
optim.total_steps=655360 \
optim.name=adamwscale \
optim.batch_size=128 \
optim.batch_size=158 \
optim.lr_scheduler=cosine \
optim.grad_acc=2 \
optim.grad_clip=1.0 \
5 changes: 2 additions & 3 deletions pretrained-model/nanoT5/run-small.sh
@@ -1,11 +1,10 @@
rm -rf /dev/shm/*
WANDB_PROJECT=nanoT5-small \
~/.local/bin/torchrun --nproc_per_node 4 \
-m train \
model.name="/home/ubuntu/malaya/pretrained-model/nanoT5/out-small-1.1" \
model.random_init=false \
data.filename.train="/home/ubuntu/nanot5-512" \
data.filename.test="/home/ubuntu/nanot5-512" \
data.filename.train="/home/ubuntu/mosaic-nanot5-512" \
data.filename.test="/home/ubuntu/mosaic-nanot5-512" \
data.input_length=512 \
checkpoint.every_steps=2000 \
optim.total_steps=65536 \