You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
2024-11-12 23:43:20.275 | ERROR | yolox.core.launch:_distributed_worker:147 - An error has been caught in function '_distributed_worker', process 'SpawnProcess-1' (2778588), thread 'MainThread' (140456832386880):
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/home/icehan/anaconda3/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
exitcode = _main(fd, parent_sentinel)
│ │ └ 3
│ └ 7
└ <function _main at 0x7fbea73f2f20>
File "/home/icehan/anaconda3/lib/python3.12/multiprocessing/spawn.py", line 135, in _main
return self._bootstrap(parent_sentinel)
│ │ └ 3
│ └ <function BaseProcess._bootstrap at 0x7fbea74eb4c0>
└
File "/home/icehan/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
│ └ <function BaseProcess.run at 0x7fbea74eaa20>
└
File "/home/icehan/anaconda3/lib/python3.12/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
│ │ │ │ │ └ {}
│ │ │ │ └
│ │ │ └ (<function _distributed_worker at 0x7fbdeee05440>, 0, (<function main at 0x7fbdd600b880>, 6, 6, 0, 'nccl', 'tcp://127.0.0.1:4...
│ │ └
│ └ <function _wrap at 0x7fbdeffee160>
└
File "/home/icehan/anaconda3/lib/python3.12/site-packages/torch/multiprocessing/spawn.py", line 76, in _wrap
fn(i, *args)
│ │ └ (<function main at 0x7fbdd600b880>, 6, 6, 0, 'nccl', 'tcp://127.0.0.1:48377', (╒═══════════════════╤═════════════════════════...
│ └ 0
└ <function _distributed_worker at 0x7fbdeee05440>
File "/home/icehan/project/YOLOX/yolox/core/launch.py", line 147, in _distributed_worker
main_func(*args)
│ └ (╒═══════════════════╤═══════════════════════════════════════════════════════════════════════════════════════════════════════...
└ <function main at 0x7fbdd600b880>
File "/home/icehan/project/YOLOX/tools/train.py", line 118, in main
trainer.train()
│ └ <function Trainer.train at 0x7fbdd5e73d80>
└ <yolox.core.trainer.Trainer object at 0x7fbdd5f57860>
File "/home/icehan/project/YOLOX/yolox/core/trainer.py", line 77, in train
self.train_in_epoch()
│ └ <function Trainer.train_in_epoch at 0x7fbdd5e81120>
└ <yolox.core.trainer.Trainer object at 0x7fbdd5f57860>
File "/home/icehan/project/YOLOX/yolox/core/trainer.py", line 87, in train_in_epoch
self.train_in_iter()
│ └ <function Trainer.train_in_iter at 0x7fbdd5ef0ea0>
└ <yolox.core.trainer.Trainer object at 0x7fbdd5f57860>
File "/home/icehan/project/YOLOX/yolox/core/trainer.py", line 93, in train_in_iter
self.train_one_iter()
│ └ <function Trainer.train_one_iter at 0x7fbdd5ef0f40>
└ <yolox.core.trainer.Trainer object at 0x7fbdd5f57860>
File "/home/icehan/project/YOLOX/yolox/models/yolo_head.py", line 551, in simota_matching
_, pos_idx = torch.topk(
│ │ └ <built-in method topk of type object at 0x7fbe9d1935e0>
│ └ <module 'torch' from '/home/icehan/anaconda3/lib/python3.12/site-packages/torch/__init__.py'>
└
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
The text was updated successfully, but these errors were encountered:
2024-11-12 23:43:20.275 | ERROR | yolox.core.launch:_distributed_worker:147 - An error has been caught in function '_distributed_worker', process 'SpawnProcess-1' (2778588), thread 'MainThread' (140456832386880):
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/home/icehan/anaconda3/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
exitcode = _main(fd, parent_sentinel)
│ │ └ 3
│ └ 7
└ <function _main at 0x7fbea73f2f20>
File "/home/icehan/anaconda3/lib/python3.12/multiprocessing/spawn.py", line 135, in _main
return self._bootstrap(parent_sentinel)
│ │ └ 3
│ └ <function BaseProcess._bootstrap at 0x7fbea74eb4c0>
└
File "/home/icehan/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
│ └ <function BaseProcess.run at 0x7fbea74eaa20>
└
File "/home/icehan/anaconda3/lib/python3.12/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
│ │ │ │ │ └ {}
│ │ │ │ └
│ │ │ └ (<function _distributed_worker at 0x7fbdeee05440>, 0, (<function main at 0x7fbdd600b880>, 6, 6, 0, 'nccl', 'tcp://127.0.0.1:4...
│ │ └
│ └ <function _wrap at 0x7fbdeffee160>
└
File "/home/icehan/anaconda3/lib/python3.12/site-packages/torch/multiprocessing/spawn.py", line 76, in _wrap
fn(i, *args)
│ │ └ (<function main at 0x7fbdd600b880>, 6, 6, 0, 'nccl', 'tcp://127.0.0.1:48377', (╒═══════════════════╤═════════════════════════...
│ └ 0
└ <function _distributed_worker at 0x7fbdeee05440>
File "/home/icehan/project/YOLOX/tools/train.py", line 118, in main
trainer.train()
│ └ <function Trainer.train at 0x7fbdd5e73d80>
└ <yolox.core.trainer.Trainer object at 0x7fbdd5f57860>
File "/home/icehan/project/YOLOX/yolox/core/trainer.py", line 77, in train
self.train_in_epoch()
│ └ <function Trainer.train_in_epoch at 0x7fbdd5e81120>
└ <yolox.core.trainer.Trainer object at 0x7fbdd5f57860>
File "/home/icehan/project/YOLOX/yolox/core/trainer.py", line 87, in train_in_epoch
self.train_in_iter()
│ └ <function Trainer.train_in_iter at 0x7fbdd5ef0ea0>
└ <yolox.core.trainer.Trainer object at 0x7fbdd5f57860>
File "/home/icehan/project/YOLOX/yolox/core/trainer.py", line 93, in train_in_iter
self.train_one_iter()
│ └ <function Trainer.train_one_iter at 0x7fbdd5ef0f40>
└ <yolox.core.trainer.Trainer object at 0x7fbdd5f57860>
File "/home/icehan/project/YOLOX/yolox/core/trainer.py", line 107, in train_one_iter
outputs = self.model(inps, targets)
│ │ │ └
│ │ └
│ └ DistributedDataParallel(
│ (module): YOLOX(
│ (backbone): YOLOPAFPN(
│ (backbone): CSPDarknet(
│ (stem): Focus(
│ ...
└ <yolox.core.trainer.Trainer object at 0x7fbdd5f57860>
File "/home/icehan/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
│ │ │ └ {}
│ │ └
│ └ <function Module._call_impl at 0x7fbdf0492e80>
└ DistributedDataParallel(
(module): YOLOX(
(backbone): YOLOPAFPN(
(backbone): CSPDarknet(
(stem): Focus(
...
File "/home/icehan/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
│ │ └ {}
│ └
└ <bound method DistributedDataParallel.forward of DistributedDataParallel(
(module): YOLOX(
(backbone): YOLOPAFPN(
...
File "/home/icehan/anaconda3/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1636, in forward
else self._run_ddp_forward(*inputs, **kwargs)
│ │ │ └ {}
│ │ └
│ └ <function DistributedDataParallel._run_ddp_forward at 0x7fbdf00b4040>
└ DistributedDataParallel(
(module): YOLOX(
(backbone): YOLOPAFPN(
(backbone): CSPDarknet(
(stem): Focus(
...
File "/home/icehan/anaconda3/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 1454, in _run_ddp_forward
return self.module(*inputs, **kwargs) # type: ignore[index]
│ │ └ {}
│ └
└ DistributedDataParallel(
(module): YOLOX(
(backbone): YOLOPAFPN(
(backbone): CSPDarknet(
(stem): Focus(
...
File "/home/icehan/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
│ │ │ └ {}
│ │ └
│ └ <function Module._call_impl at 0x7fbdf0492e80>
└ YOLOX(
(backbone): YOLOPAFPN(
(backbone): CSPDarknet(
(stem): Focus(
(conv): BaseConv(
(conv): ...
File "/home/icehan/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
│ │ └ {}
│ └
└ <bound method YOLOX.forward of YOLOX(
(backbone): YOLOPAFPN(
(backbone): CSPDarknet(
(stem): Focus(
(conv...
File "/home/icehan/project/YOLOX/yolox/models/yolox.py", line 34, in forward
loss, iou_loss, conf_loss, cls_loss, l1_loss, num_fg = self.head(
└ YOLOX(
(backbone): YOLOPAFPN(
(backbone): CSPDarknet(
(stem): Focus(
(conv): BaseConv(
(conv): ...
File "/home/icehan/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
│ │ │ └ {}
│ │ └
│ └ <function Module._call_impl at 0x7fbdf0492e80>
└ YOLOXHead(
(cls_convs): ModuleList(
(0-2): 3 x Sequential(
(0): BaseConv(
(conv): Conv2d(256, 256, kernel...
File "/home/icehan/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
│ │ └ {}
│ └
└ <bound method YOLOXHead.forward of YOLOXHead(
(cls_convs): ModuleList(
(0-2): 3 x Sequential(
(0): BaseConv(
...
File "/home/icehan/project/YOLOX/yolox/models/yolo_head.py", line 194, in forward
return self.get_losses(
│ └ <function YOLOXHead.get_losses at 0x7fbdc00bc220>
└ YOLOXHead(
(cls_convs): ModuleList(
(0-2): 3 x Sequential(
(0): BaseConv(
(conv): Conv2d(256, 256, kernel...
File "/home/icehan/project/YOLOX/yolox/models/yolo_head.py", line 310, in get_losses
) = self.get_assignments( # noqa
│ └ <function YOLOXHead.get_assignments at 0x7fbdc00bc400>
└ YOLOXHead(
(cls_convs): ModuleList(
(0-2): 3 x Sequential(
(0): BaseConv(
(conv): Conv2d(256, 256, kernel...
File "/home/icehan/anaconda3/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
│ │ └ {}
│ └
└ <function YOLOXHead.get_assignments at 0x7fbdc00bc360>
File "/home/icehan/project/YOLOX/yolox/models/yolo_head.py", line 496, in get_assignments
) = self.simota_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask)
│ │ │ │ │ │ └
│ │ │ │ │ └ 6
│ │ │ │ └
│ │ │ └
│ │ └
│ └ <function YOLOXHead.simota_matching at 0x7fbdc00bc540>
└ YOLOXHead(
(cls_convs): ModuleList(
(0-2): 3 x Sequential(
(0): BaseConv(
(conv): Conv2d(256, 256, kernel...
File "/home/icehan/project/YOLOX/yolox/models/yolo_head.py", line 551, in simota_matching
_, pos_idx = torch.topk(
│ │ └ <built-in method topk of type object at 0x7fbe9d1935e0>
│ └ <module 'torch' from '/home/icehan/anaconda3/lib/python3.12/site-packages/torch/__init__.py'>
└
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
The text was updated successfully, but these errors were encountered: