Commit

do all reduce on cpu
samsja committed Sep 11, 2024
1 parent 5b701d7 commit 04dd585
Showing 1 changed file with 1 addition and 1 deletion.
open_diloco/train_pure_fsdp.py (1 addition, 1 deletion)
@@ -234,7 +234,7 @@ def train(config: Config):
     # todo check how to handle the SHARD_GRAD_OP strategy where the weight are replicated across the local devices
     param_offloaded.grad = param_offloaded.data - param.data.to(param_offloaded.device)

-    if param_offloaded.grad.device != torch.device("cpu"):
+    if param_offloaded.grad.device == torch.device("cpu"):
         # gloo does not support AVG
         param_offloaded.grad = param_offloaded.grad / global_pg.size()
         dist.all_reduce(param_offloaded.grad, op=dist.ReduceOp.SUM, group=global_pg)
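The guarded block works around the gloo backend's lack of ReduceOp.AVG: each rank divides its local pseudo-gradient by the group size and then runs a SUM all-reduce, which yields the same result as an averaging reduce. A minimal pure-Python sketch of that equivalence (the per-rank gradient values here are made up for illustration; `world_size` stands in for `global_pg.size()`):

```python
# Emulating an AVG all-reduce with a SUM all-reduce, as in the commit:
# each rank pre-divides its gradient by the world size before summing.
world_size = 4
local_grads = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]  # one list per rank

# What dist.all_reduce(t, op=ReduceOp.SUM) would compute after each rank
# first does t = t / world_size:
pre_divided = [[g / world_size for g in rank] for rank in local_grads]
summed = [sum(col) for col in zip(*pre_divided)]

# Equals the true element-wise average of the per-rank gradients,
# since sum(g_i / n) == sum(g_i) / n.
avg = [sum(col) / world_size for col in zip(*local_grads)]
print(summed == avg)  # True
```

With NCCL one could use ReduceOp.AVG directly, but since this all-reduce now runs on CPU tensors over gloo, the divide-then-sum form is the portable choice.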
