Merge pull request #523 from Anhforth/fit_torch
Fit torch
BAAI-OpenPlatform authored Aug 7, 2023
2 parents d3722a6 + 550bf29 commit d878852
Showing 18 changed files with 46 additions and 37 deletions.
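In short, this change set moves the example launch scripts from python -m torch.distributed.launch to torchrun, and has the training code read the local rank from the LOCAL_RANK environment variable (which torchrun sets for every worker) instead of a --local_rank command-line argument. As a rough illustration only, not part of the commit, the pattern looks like this on a CUDA machine:

    import os
    import torch

    # torchrun exports RANK, WORLD_SIZE, and LOCAL_RANK for each worker process,
    # so the script reads the local rank from the environment rather than argparse.
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)

    # Launched with, for example:
    #   torchrun --nproc_per_node 8 --nnodes 1 train.py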
17 changes: 8 additions & 9 deletions examples/Aquila/Aquila-chat/Aquila-chat-lora.yaml
@@ -1,19 +1,18 @@
batch_size: 4
batch_size: 1
epochs: 5
gradient_accumulation_steps: 1
lr: 3.0e-4
lr: 4.0e-5
warm_up: 0.01
warm_up_iters: 200
lora_r: 16
lora_r: 8
lora_alpha: 32
epochs: 30000000
save_interval: 300
log_interval: 10
save_interval: 200
log_interval: 1
bmt_cpu_offload: False
bmt_pre_load: True

bmt_lr_decay_style: 'cosine'
save_optim: True
save_rng: True

enable_flash_attn_models: False
eps: 1.0e-8
lora: True

6 changes: 3 additions & 3 deletions examples/Aquila/Aquila-chat/aquila_chat.py
@@ -61,7 +61,7 @@
import sys
sys.exit(0)

print(f"Trainer effective env_args={env_args} local_rank={trainer.local_rank}", flush=True)
print(f"Trainer effective env_args={env_args} local_rank={os.environ['LOCAL_RANK']}", flush=True)

checkpoints = env_args.pre_load_dir

@@ -78,7 +78,7 @@
# avoid sync loading models in case of Mem OOM
if env_args.bmt_async_load:
import time
time.sleep(10*60*(trainer.local_rank%4))
time.sleep(10*60*(os.environ['LOCAL_RANK']%4))


config_file = os.path.join(cache_dir, 'config.json')
@@ -98,7 +98,7 @@

trainer.pre_train(model)

# print('*'*20, "model", model, flush=True)
print('*'*20, "model", model, flush=True)

assert env_args.enable_sft_dataset_dir is not None and \
env_args.enable_sft_dataset_file is not None
2 changes: 1 addition & 1 deletion examples/Aquila/Aquila-chat/bmtrain_mgpu.sh
@@ -80,7 +80,7 @@ OPTS=" --batch_size $BATCH_SIZE \
## Trigger job on Each Node when bmt or ddp.

mkdir -p $PRE_LOAD_DIR
python -m torch.distributed.launch \
torchrun \
--nproc_per_node $GPU_NUM_PER_NODE \
--nnodes $NODES_NUM \
--node_rank $RANK \
7 changes: 6 additions & 1 deletion examples/Aquila/Aquila-chat/generate_chat_lora.py
@@ -20,7 +20,7 @@
use_cache=True,
fp16=True,
device='cuda',
adapter_dir='/data2/yzd/FlagAI/examples/Aquila/Aquila-chat/checkpoints_out/aquila_experiment75new/2023070515/') # eg: /mnt/yzd/git/FlagAI/examples/Aquila/Aquila-chat/checkpoints_out/aquila_experiment/2023062909
adapter_dir='/data2/yzd/FlagAI/examples/Aquila/Aquila-chat/checkpoints_out/aquila_experiment/2023080216/') # Directory to adapter_model.bin and adapter_config.json
model = loader.get_model()

tokenizer = loader.get_tokenizer()
@@ -32,6 +32,11 @@

texts = [
"Find the product of the numbers: 5 and 8",
"Create a list of potential topics for a company newsletter",
"Explain the theory of relativity in simple terms.",
"Write a short story about a dragon and a knight.",
"翻译成英文: '我饿了想吃饭'",
"write a fairy tale for me",
]

for text in texts:
2 changes: 1 addition & 1 deletion examples/Aquila/Aquila-chat/hostfile
@@ -1 +1 @@
192.168.21.7 slots=2
192.168.20.3 slots=1
2 changes: 1 addition & 1 deletion examples/Aquila/Aquila-code/aquila_code_pretrain.py
@@ -77,7 +77,7 @@
# avoid sync loading models in case of Mem OOM
if env_args.bmt_async_load:
import time
time.sleep(10 * 60 * (trainer.local_rank % 4))
time.sleep(10 * 60 * (os.environ['LOCAL_RANK'] % 4))

config_file = os.path.join(cache_dir, 'config.json')
model = AQUILAModel.init_from_json(config_file=config_file)
2 changes: 1 addition & 1 deletion examples/Aquila/Aquila-code/bmtrain_mgpu.sh
@@ -80,7 +80,7 @@ OPTS=" --batch_size $BATCH_SIZE \
## Trigger job on Each Node when bmt or ddp.

mkdir -p $PRE_LOAD_DIR
python -m torch.distributed.launch \
torchrun \
--nproc_per_node $GPU_NUM_PER_NODE \
--nnodes $NODES_NUM \
--node_rank $RANK \
5 changes: 2 additions & 3 deletions examples/Aquila/Aquila-pretrain/aquila_pretrain.py
@@ -5,7 +5,6 @@
import torch
from torch.utils.data import Dataset
import gc

gc.collect()
torch.cuda.empty_cache()
from flagai.auto_model.auto_loader import AutoLoader
@@ -62,7 +61,7 @@
import sys
sys.exit(0)

print(f"Trainer effective env_args={env_args} local_rank={trainer.local_rank}",
print(f"Trainer effective env_args={env_args} local_rank={os.environ['LOCAL_RANK']}",
flush=True)
checkpoints = env_args.pre_load_dir
model_name = env_args.model_name
@@ -77,7 +76,7 @@
# avoid sync loading models in case of Mem OOM
if env_args.bmt_async_load:
import time
time.sleep(10 * 60 * (trainer.local_rank % 4))
time.sleep(10 * 60 * (os.environ['LOCAL_RANK'] % 4))

config_file = os.path.join(cache_dir, 'config.json')
model = AQUILAModel.init_from_json(config_file=config_file)
2 changes: 1 addition & 1 deletion examples/Aquila/Aquila-pretrain/bmtrain_mgpu.sh
@@ -80,7 +80,7 @@ OPTS=" --batch_size $BATCH_SIZE \
## Trigger job on Each Node when bmt or ddp.

mkdir -p $PRE_LOAD_DIR
python -m torch.distributed.launch \
torchrun \
--nproc_per_node $GPU_NUM_PER_NODE \
--nnodes $NODES_NUM \
--node_rank $RANK \
4 changes: 2 additions & 2 deletions examples/Aquila/aquila_pretrain.py
@@ -62,7 +62,7 @@
import sys
sys.exit(0)

print(f"Trainer effective env_args={env_args} local_rank={trainer.local_rank}",
print(f"Trainer effective env_args={env_args} local_rank={os.environ['LOCAL_RANK']}",
flush=True)
checkpoints = env_args.pre_load_dir
model_name = env_args.model_name
@@ -77,7 +77,7 @@
# avoid sync loading models in case of Mem OOM
if env_args.bmt_async_load:
import time
time.sleep(10 * 60 * (trainer.local_rank % 4))
time.sleep(10 * 60 * (os.environ['LOCAL_RANK'] % 4))

config_file = os.path.join(cache_dir, 'config.json')
model = AQUILAModel.init_from_json(config_file=config_file)
2 changes: 1 addition & 1 deletion examples/Aquila/bmtrain_mgpu.sh
@@ -80,7 +80,7 @@ OPTS=" --batch_size $BATCH_SIZE \
## Trigger job on Each Node when bmt or ddp.

mkdir -p $PRE_LOAD_DIR
python -m torch.distributed.launch \
torchrun \
--nproc_per_node $GPU_NUM_PER_NODE \
--nnodes $NODES_NUM \
--node_rank $RANK \
2 changes: 1 addition & 1 deletion examples/Aquila/hostfile
@@ -1 +1 @@
192.168.21.2 slots=4
192.168.20.3 slots=2
2 changes: 1 addition & 1 deletion examples/swinv2/inference_swinv2.py
file mode changed 100644 → 100755
@@ -6,7 +6,7 @@
from tqdm import tqdm
from flagai.auto_model.auto_loader import AutoLoader

data_path = "./imagenet2012/"
data_path = "/data2/yzd/FlagAI/examples/swinv2/imagenet2012/"

# swinv2 model_name support:
# 1. swinv2-base-patch4-window16-256,
3 changes: 2 additions & 1 deletion flagai/env_args.py
@@ -146,7 +146,7 @@ def __init__(self,
self.parser.add_argument('--num_nodes', default=num_nodes, type=int, help='start training from saved checkpoint')
self.parser.add_argument('--num_gpus', default=num_gpus, type=int, help='start training from saved checkpoint')
self.parser.add_argument('--not_call_launch', action="store_true", help='start training from saved checkpoint')
self.parser.add_argument('--local_rank', default=0, type=int, help='start training from saved checkpoint')
self.parser.add_argument('--local-rank', default=0, type=int, help='start training from saved checkpoint')

self.parser.add_argument('--wandb', default=wandb, type=str2bool, help='whether to use wandb')
self.parser.add_argument('--wandb_dir', default=wandb_dir, type=str, help='wandb directory')
@@ -204,6 +204,7 @@ def parse_args(self):
if args.env_type == "pytorch":
# not need the "not_call_launch" parameter
args.not_call_launch = True
print(args)
for arg in vars(args):
# change string format list to back to python list object
value = getattr(args, arg)
7 changes: 5 additions & 2 deletions flagai/env_trainer_v1.py
@@ -156,7 +156,10 @@ def __init__(self,
training_paras = get_args_list(env_args)
self.rank = int(os.environ.get('RANK', 0))
self.world_size = int(os.environ.get('WORLD_SIZE', 1))
self.local_rank = env_args.local_rank
# self.local_rank = env_args.local_rank
print(os.environ['LOCAL_RANK'])
self.local_rank = int(os.environ['LOCAL_RANK'])

log_dist("not_call_launch: {}".format(self.not_call_launch))
# Implement for AutoLaunch
# >>> python train.py # will call get_dist_args()
@@ -196,7 +199,7 @@ def initialize_distributed(self):
device = self.rank % torch.cuda.device_count()
if self.local_rank is not None:
device = self.local_rank
torch.cuda.set_device(device)
torch.cuda.set_device(int(device))
# Call the init process
init_method = 'tcp://'
self.master_ip = os.getenv('MASTER_ADDR', 'localhost')
2 changes: 1 addition & 1 deletion flagai/model/predictor/aquila.py
@@ -30,7 +30,7 @@ def aquila_generate(
total_len = min(2048, max_gen_len + max_prompt_size)

tokens = torch.full((bsz, total_len), 0).cuda().long()
#tokens = torch.full((bsz, total_len), 0).to("cuda:5").long()

for k, t in enumerate(prompt_tokens):
tokens[k, : len(t)] = t.clone().detach().long()
input_text_mask = tokens != 0
7 changes: 4 additions & 3 deletions flagai/model/vision/swinv2.py
@@ -159,7 +159,7 @@ def forward(self, x, mask=None):

# cosine attention
attn = (F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1))
logit_scale = torch.clamp(self.logit_scale, max=torch.log(torch.tensor(1. / 0.01))).exp()
logit_scale = torch.clamp(self.logit_scale, max=torch.log(torch.tensor(1. / 0.01)).to(self.logit_scale.device)).exp()
attn = attn * logit_scale

relative_position_bias_table = self.cpb_mlp(self.relative_coords_table).view(-1, self.num_heads)
@@ -518,7 +518,8 @@ def __init__(self, img_size=224,
patch_norm=True,
pretrained_window_sizes=[0, 0, 0, 0],
checkpoint_activations=False,
num_classes=1000):
num_classes=1000,
**kwargs):
self.num_classes = num_classes
self.img_size = img_size
self.patch_size = patch_size
@@ -566,7 +567,7 @@ class SwinTransformerV2(BaseModel):

def __init__(self, config, num_classes=1000, **kwargs):
super().__init__(config, **kwargs)
swin_config = SwinTransformerConfig(**config)
swin_config = SwinTransformerConfig(**config.json_config)

embed_dim = swin_config.embed_dim
img_size = swin_config.img_size
9 changes: 5 additions & 4 deletions setup.py
@@ -5,7 +5,7 @@

setup(
name="flagai",
version="v1.7.3",
version="v1.7.5",
description="FlagAI aims to help researchers and developers to freely train and test large-scale models for NLP/CV/VL tasks.",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",
@@ -21,8 +21,8 @@
'sentencepiece>=0.1.96',
'boto3==1.17.32',
'pandas>=1.3.5',
'jieba==0.42.1',
'scikit-learn==1.0.2',
'jieba>=0.42.1',
'scikit-learn>=1.0.2',
'tensorboard>=2.9.0',
'transformers>=4.20.1',
'datasets>=2.0.0',
@@ -32,13 +32,14 @@
'Pillow>=9.3.0',
'einops>=0.3.0',
'diffusers==0.7.2',
'pytorch-lightning==1.6.5',
'pytorch-lightning>=1.6.5',
'taming-transformers-rom1504==0.0.6',
'rouge-score',
'sacrebleu>=2.3.1',
'jsonlines',
'accelerate',
'PyYAML==5.4.1',
'safetensors',
'timm',
]
)
