fix: DPO sequence packing with pipeline parallelism (#437)

Signed-off-by: ashors1 <[email protected]>
NVIDIA · Dec 9, 2024 · 3791aad
1 parent cf14d1c
commit 3791aad
Showing 1 changed file with 2 additions and 0 deletions.
diff --git a/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py b/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py
@@ -111,6 +111,8 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_
                 required_keys.add("attention_mask")
                 if "cu_seqlens" in batch:
                     required_keys.add("cu_seqlens")
+                    required_keys.add("max_seqlen")
+                    required_keys.add("cu_seqlens_argmin")
 
                 if parallel_state.is_pipeline_first_stage():
                     if packed: