Thanks for your great work. When I try to train the model on eight RTX 3090 GPUs with the following command:
./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 8 --amp --ref 32
I encounter the following errors.
Starting training from iteration 0
Exception during training:
Traceback (most recent call last):
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/detectron2/engine/train_loop.py", line 149, in train
self.run_step()
File "/home/zoloz/8T-1/zitong/code/ODISE/odise/engine/train_loop.py", line 297, in run_step
grad_norm = self.grad_scaler(
File "/home/zoloz/8T-1/zitong/code/ODISE/odise/engine/train_loop.py", line 207, in __call__
self._scaler.scale(loss).backward(create_graph=create_graph)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/_tensor.py", line 488, in backward
torch.autograd.backward(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/__init__.py", line 197, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/function.py", line 267, in apply
return user_fn(self, *args)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/ldm/modules/diffusionmodules/util.py", line 142, in backward
input_grads = torch.autograd.grad(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/__init__.py", line 300, in grad
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/function.py", line 267, in apply
return user_fn(self, *args)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autogr
[06/17 03:38:05 d2.engine.hooks]: Total training time: 0:00:25 (0:00:00 on hooks)
[06/17 03:38:05 d2.utils.events]: odise_label_coco_50e_bs16x8/default iter: 0/368752 lr: N/A max_mem: 19297M
Traceback (most recent call last):
File "/home/zoloz/8T-1/zitong/code/ODISE/./tools/train_net.py", line 392, in <module>
launch(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/detectron2/engine/launch.py", line 67, in launch
mp.spawn(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/detectron2/engine/launch.py", line 126, in _distributed_worker
main_func(*args)
File "/home/zoloz/8T-1/zitong/code/ODISE/tools/train_net.py", line 363, in main
do_train(args, cfg)
File "/home/zoloz/8T-1/zitong/code/ODISE/tools/train_net.py", line 309, in do_train
trainer.train(start_iter, cfg.train.max_iter)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/detectron2/engine/train_loop.py", line 149, in train
self.run_step()
File "/home/zoloz/8T-1/zitong/code/ODISE/odise/engine/train_loop.py", line 297, in run_step
grad_norm = self.grad_scaler(
File "/home/zoloz/8T-1/zitong/code/ODISE/odise/engine/train_loop.py", line 207, in __call__
self._scaler.scale(loss).backward(create_graph=create_graph)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/_tensor.py", line 488, in backward
torch.autograd.backward(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/__init__.py", line 197, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/function.py", line 267, in apply
return user_fn(self, *args)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/ldm/modules/diffusionmodules/util.py", line 142, in backward
input_grads = torch.autograd.grad(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/__init__.py", line 300, in grad
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/function.py", line 267, in apply
return user_fn(self, *args)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/autograd/function.py", line 414, in wrapper
outputs = fn(ctx, *args)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/xformers/ops/fmha/__init__.py", line 111, in backward
grads = _memory_efficient_attention_backward(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/xformers/ops/fmha/__init__.py", line 382, in _memory_efficient_attention_backward
grads = op.apply(ctx, inp, grad)
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/xformers/ops/fmha/cutlass.py", line 184, in apply
(grad_q, grad_k, grad_v,) = cls.OPERATOR(
File "/home/dazhi/miniconda3/envs/odise/lib/python3.9/site-packages/torch/_ops.py", line 442, in __call__
return self._op(*args, **kwargs or {})
RuntimeError: CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
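For what it's worth, the failure comes out of the xformers memory-efficient attention backward (fmha/cutlass.py), so one way to narrow it down is to run that op in isolation, outside of ODISE. Below is a minimal sketch of my own (not from the ODISE code base, and the tensor shapes are made up); if it also fails with "CUDA error: invalid argument" on the 3090, the problem is likely the xformers/CUDA build rather than the training code. Running it with CUDA_LAUNCH_BLOCKING=1, as the trace suggests, should give an accurate stack.

```python
# Minimal sketch: xformers memory-efficient attention forward + backward in fp16,
# the op that appears in the last frames of the traceback above (used under --amp).
# Shapes here are hypothetical (batch, seq_len, heads, head_dim); ODISE's real
# attention shapes may differ.
import torch
import xformers.ops as xops

device = "cuda:0"
dtype = torch.float16

q = torch.randn(2, 1024, 8, 64, device=device, dtype=dtype, requires_grad=True)
k = torch.randn_like(q, requires_grad=True)
v = torch.randn_like(q, requires_grad=True)

out = xops.memory_efficient_attention(q, k, v)  # forward pass
out.sum().backward()                            # backward is where the reported error occurs
print("memory_efficient_attention backward OK, grad shape:", q.grad.shape)
```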