fix: adds missing support for mcore dist opt and adds test for moe
Signed-off-by: Terry Kong <[email protected]>

moe test is all2all

Signed-off-by: Terry Kong <[email protected]>

other params

Signed-off-by: Terry Kong <[email protected]>

fix peft mixtral

Signed-off-by: Terry Kong <[email protected]>

dockerfile bump to be on dev

Signed-off-by: Terry Kong <[email protected]>

just take dockerfile on dev

Signed-off-by: Terry Kong <[email protected]>
terrykong committed Dec 6, 2024
1 parent 3604fc4 commit eab96f2
Showing 20 changed files with 175 additions and 41 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/cicd-main.yml
@@ -96,6 +96,8 @@ jobs:
- sft-llama3
- sft-llama3-cp
- rm-llama3
- dpo-mixtral-ep
- dpo-mixtral-peft-tp-sp
with:
RUNNER: self-hosted-azure
# Fairly aggressive timeout that all functional tests should try to adhere to
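Judging by the test names and the commit message ("moe test is all2all"), the two new CI entries exercise DPO on a Mixtral MoE model: dpo-mixtral-ep with expert parallelism and the all2all token dispatcher, and dpo-mixtral-peft-tp-sp with PEFT under tensor and sequence parallelism.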
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_dpo.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# dpo specific args
dpo:
@@ -17,6 +18,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
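The same two-line pattern repeats in every config below: the PTL-level trainer.gradient_clip_val is pinned to 0.0 (per its inline comment, the Megatron Core optimizer is what consumes this value), while the algorithm-level gradient_clip_val: 1.0 survives only until the Megatron Core optimizer becomes the default. A minimal sketch of the resulting dispatch, with an invented helper name (the real wiring lives in the trainer setup and in clip_gradients() further down this diff):

    def clip_owner(use_mcore_dist_optim: bool) -> str:
        # Sketch only: mirrors the config comments, not actual NeMo-Aligner code.
        if use_mcore_dist_optim:
            # trainer.gradient_clip_val (0.0 here) is the value the Megatron Core
            # optimizer path reads; any clipping happens inside optimizer.step().
            return "megatron core optimizer step"
        # Legacy path: the algorithm-level value (e.g. trainer.dpo.gradient_clip_val)
        # is forwarded to nemo_aligner.utils.train_utils.clip_gradients().
        return "aligner clip_gradients()"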
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_kto.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# kto specific args
kto:
@@ -17,6 +18,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_ppo_actor.yaml
@@ -7,6 +7,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

ppo:
# How many steps we train warmup the critic for (without training the policy)
@@ -21,6 +22,7 @@
max_steps: -1 # max PPO steps (-1 to go through the whole train set)
val_check_interval: 10
save_interval: ${.val_check_interval}
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# PPO args to generate the data for training
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_ppo_critic.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

ppo:
port: 5556
@@ -15,6 +16,7 @@

# used to set the learning rate scheduler
max_steps: 10000
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# a PyTriton parameter to specify
4 changes: 3 additions & 1 deletion examples/nlp/gpt/conf/gpt_rs_actor.yaml
@@ -7,12 +7,14 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

rs:
max_epochs: 1
max_steps: -1 # max rs steps (-1 to go through the whole train set)
val_check_interval: 10
save_interval: ${.val_check_interval}
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# pick up from the model
@@ -178,4 +180,4 @@ model:
# define fields from the base model's config that should be ignored when merging with this config.
overwrite_base_config:
data:
data_prefix: True
data_prefix: True
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_sft.yaml
@@ -5,6 +5,7 @@ trainer:
devices: 1
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

sft:
max_epochs: 1
@@ -15,6 +16,7 @@
limit_train_batches: 1.0

limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# can be used to register any custom metrics that require token-by-token generation
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_spin.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16-mixed
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# spin specific args
spin:
@@ -18,6 +19,7 @@

# how many GBS we loop over
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/training_rm.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# rm specific args
rm:
@@ -20,6 +21,7 @@
# set to float for a percentage
# of the validation dataset
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/critic_server_trainer.py
@@ -322,7 +322,7 @@ def run_training(self, tokens=None, returns=None, prev_values=None, mask=None):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

if grad_norm is not None:
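The same one-line change is applied in dpo.py, ppo.py, rs.py, spin.py, and supervised.py below. The explicit keyword argument matters because, unlike torch.optim.Optimizer.step(closure=None), the Megatron Core distributed-optimizer wrapper appears to declare closure without a default, so a bare .step() would raise a TypeError. A self-contained sketch of that assumption (illustrative class, not the actual NeMo/Megatron Core API):

    class DistOptWrapper:
        # Assumed signature: `closure` has no default value.
        def step(self, closure):
            if closure is not None:
                closure()  # re-evaluate the loss when a closure is supplied
            # ... distributed optimizer update would run here ...

    opt = DistOptWrapper()
    opt.step(closure=None)  # explicit argument satisfies both signatures
    # opt.step()            # TypeError: step() missing 1 required positional argument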
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/dpo.py
@@ -220,7 +220,7 @@ def train_single_step(self, global_batch):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

trainer_metrics = {}
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/ppo.py
@@ -440,7 +440,7 @@ def run_training(self, dataloader_iter):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

if grad_norm is not None:
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/rs.py
@@ -294,7 +294,7 @@ def run_training(self, dataloader_iter):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

if grad_norm is not None:
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/spin.py
@@ -195,7 +195,7 @@ def train_single_step(self, global_batch):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

trainer_metrics = {}
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/supervised.py
@@ -150,7 +150,7 @@ def train_single_step(self, batch):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

trainer_metrics = {}
56 changes: 44 additions & 12 deletions nemo_aligner/utils/train_utils.py
@@ -101,31 +101,52 @@ def prepare_for_training_step(ptl_model, zero_grad=True):
param.data_ptr()


# TODO: Delete this once API introduced in NeMo (https://github.com/NVIDIA/NeMo/pull/10803)
# TODO: Update PR to move this logic into staticmethod in nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
def grad_reductions(ptl_model):
# when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced
if ptl_model.cfg.get("tensor_model_parallel_size", 1) > 1 and ptl_model.cfg.get("sequence_parallel", False):
ptl_model.allreduce_sequence_parallel_gradients()

if ptl_model.with_distributed_adam:
# synchronize asynchronous grad reductions
# note: not necessary, but reduces performance degradation
# from multiple simultaneous NCCL calls
ptl_model._optimizer._finish_bucket_grad_sync()
# Mcore DistOpt handles this, so we don't have to
if not ptl_model.use_mcore_dist_optim:
ptl_model.megatron_timer_start("allreduce_sequence_parallel_gradients", log_level=1)
ptl_model.allreduce_sequence_parallel_gradients()
ptl_model.megatron_timer_stop("allreduce_sequence_parallel_gradients")

ptl_model.megatron_timer_start("gradient_allreduce", log_level=1)
if ptl_model.use_fsdp:
# Reduce the gradients omitted from FSDP-sharding
ptl_model.allreduce_fsdp_sharding_omitted_gradients()
elif ptl_model.with_distributed_adam:
if not ptl_model.use_mcore_dist_optim:
# synchronize asynchronous grad reductions
# note: not necessary, but reduces performance degradation
# from multiple simultaneous NCCL calls
ptl_model._optimizer._finish_bucket_grad_sync()
# else: Mcore distributed optim calls finalize_model_grads to finish grad sync
elif ptl_model.megatron_amp_O2:
# when using pipeline parallelism grads must be all-reduced after the pipeline (not asynchronously)
if ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1 or ptl_model.cfg.get("sequence_parallel", False):
if (
ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1
or ptl_model.cfg.get("sequence_parallel", False)
or not ptl_model.cfg.get("async_grad_allreduce", True)
):
# main grads are stored in the MainParamsOptimizer wrapper
ptl_model._optimizer.allreduce_main_grads()
else:
# async grad allreduce is not currently implemented for O1/autocasting mixed precision training
# so we all-reduce gradients after the pipeline
ptl_model.allreduce_gradients() # @sangkug we think this is causing memory to blow up (hurts perf)
ptl_model.megatron_timer_stop("gradient_allreduce")

if ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1 and ptl_model.cfg.get(
"share_embeddings_and_output_weights", True
if (
not ptl_model.use_mcore_dist_optim
and ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1
and ptl_model.cfg.get("share_embeddings_and_output_weights", True)
):
ptl_model.megatron_timer_start("allreduce_first_last_embeddings", log_level=1)
# when using pipeline parallelism the first and last stage must keep embeddings in sync
ptl_model.allreduce_first_last_embeddings()
ptl_model.megatron_timer_stop("allreduce_first_last_embeddings")


def prepare_for_validation_step(ptl_model):
@@ -155,14 +176,26 @@ def set_eval(ptl_model):
ptl_model.eval()


# TODO: adapt the version in /opt/NeMo/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
def clip_gradients(ptl_model, clip_val):
"""PTL hook to configure gradients.
We use gradient clipping implementation from megatron-lm.
"""
if clip_val is None:
return

clip_val = float(clip_val)
if clip_val <= 0:
return

if ptl_model.with_megatron_fused_adam or ptl_model.use_mcore_dist_optim:
# Gradient clipping is done in optimizer step
return

if ptl_model.grad_clip_pl_default:
# use the default behavior
return super().configure_gradient_clipping(*args, **kwargs)

if ptl_model.with_distributed_adam:
grad_norm = clip_grad_norm_distributed_optimizer(ptl_model._optimizer, clip_val)
else:
@@ -171,6 +204,5 @@ def clip_gradients(ptl_model, clip_val):
parameters = ptl_model._optimizer.get_parameters_with_grad()
else:
parameters = ptl_model.get_parameters_with_grad()
grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val)

grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val, use_fsdp=ptl_model.use_fsdp,)
return grad_norm
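Taken together, the new branches route collectives around the Megatron Core distributed optimizer when it is active: the aligner skips its own sequence-parallel all-reduce, bucket grad sync, and first/last-embedding all-reduce (per the comments above, Mcore's finalize_model_grads covers them), and clip_gradients() becomes a no-op because clipping is done in the optimizer step. A condensed sketch of the clipping dispatch as it reads after this patch (attribute names as in the diff; the return strings are just labels):

    def clip_owner(ptl_model):
        if ptl_model.with_megatron_fused_adam or ptl_model.use_mcore_dist_optim:
            return "optimizer.step()"  # clip_gradients() returns early
        if ptl_model.grad_clip_pl_default:
            return "PTL default configure_gradient_clipping"
        if ptl_model.with_distributed_adam:
            return "clip_grad_norm_distributed_optimizer"
        return "clip_grad_norm_fp32"  # now FSDP-aware via use_fsdp=ptl_model.use_fsdp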
53 changes: 33 additions & 20 deletions tests/functional/dpo.sh
@@ -1,14 +1,26 @@
#!/bin/bash

# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR
set -eoux pipefail

export NCCL_ALGO=Tree
export NVTE_APPLY_QK_LAYER_SCALING=1
export NVTE_APPLY_QK_LAYER_SCALING=${NVTE_APPLY_QK_LAYER_SCALING:-0}

KL=${KL:-0.1}
GBS=${GBS:-4}
PRETRAINED_CHECKPOINT_NEMO_FILE=${PRETRAINED_CHECKPOINT_NEMO_FILE}


@@ -23,7 +35,6 @@ mkdir -p $RESULTS_DIR

GPFS=$(git rev-parse --show-toplevel)

# START HETEROGENEUS JOB 3
CONF_DIR="${GPFS}/examples/nlp/gpt/conf/"
CONF_NAME="gpt_dpo"

@@ -33,38 +44,40 @@ dpo() {
export CUDA_VISIBLE_DEVICES=0,1
export PYTHONPATH="${GPFS}:${PYTHONPATH:-}"
export HYDRA_FULL_ERROR=1
torchrun --nproc-per-node 2 ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
torchrun --nproc_per_node=2 ${GPFS}/examples/nlp/gpt/train_gpt_dpo.py \
--config-path=${CONF_DIR} \
--config-name=${CONF_NAME} \
trainer.num_nodes=1 \
trainer.devices=2 \
++model.data.data_impl=jsonl \
++model.data.seq_length=128 \
++model.global_batch_size=${GBS} \
pretrained_checkpoint.restore_from_path=${PRETRAINED_CHECKPOINT_NEMO_FILE} \
exp_manager.create_checkpoint_callback=False \
exp_manager.explicit_log_dir=${RESULTS_DIR} \
++model.tensor_model_parallel_size=1 \
++model.pipeline_model_parallel_size=1 \
++model.global_batch_size=4 \
++model.micro_batch_size=1 \
++model.mcore_gpt=true \
++model.megatron_amp_O2=true \
++model.dpo.ref_policy_kl_penalty=${KL} \
++model.dpo.ref_policy_kl_penalty=0.1 \
++model.dpo.log_prob_forward_micro_batch_size=1 \
++model.dpo.average_log_probs=false \
++model.dpo.sft_loss_weight=0.1 \
++model.dpo.preference_loss_weight=1.0 \
pretrained_checkpoint.restore_from_path=${PRETRAINED_CHECKPOINT_NEMO_FILE} \
"model.data.data_prefix={train: [${TRAIN_DATA_PATH}], validation: [${VALID_DATA_PATH}], test: [${VALID_DATA_PATH}]}" \
exp_manager.create_checkpoint_callback=False \
++model.activations_checkpoint_granularity=full \
++model.activations_checkpoint_method=uniform \
++model.activations_checkpoint_num_layers=1 \
++model.dist_ckpt_load_strictness=log_all \
++model.data.data_impl=jsonl \
++model.data.seq_length=128 \
model.data.num_workers=2 \
++model.tensor_model_parallel_size=1 \
++model.pipeline_model_parallel_size=1 \
"model.data.data_prefix={train: [${TRAIN_DATA_PATH}], validation: [${VALID_DATA_PATH}], test: [${VALID_DATA_PATH}]}" \
trainer.dpo.max_steps=3 \
trainer.dpo.val_check_interval=3 \
trainer.dpo.limit_val_batches=8 \
trainer.dpo.save_interval=0 \
exp_manager.explicit_log_dir=${RESULTS_DIR} \
++model.activations_checkpoint_granularity=full \
++model.activations_checkpoint_method=uniform \
++model.activations_checkpoint_num_layers=1 \
++model.dist_ckpt_load_strictness=log_all
"$@"
}

log_file=$(mktemp /tmp/dpo-log-XXXXXX)
dpo | tee $log_file
dpo "$@" | tee $log_file
echo "[Finished] $0"
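Because the Hydra overrides now end with "$@" and the wrapper call forwards its arguments (dpo "$@"), CI variants can append extra overrides without editing the script. For example (hypothetical invocation): PRETRAINED_CHECKPOINT_NEMO_FILE=/path/to/ckpt.nemo bash tests/functional/dpo.sh ++model.expert_model_parallel_size=2 would run the same functional test with expert parallelism enabled.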