diff --git a/ultralytics/yolo/engine/trainer.py b/ultralytics/yolo/engine/trainer.py index e4925b3..144be9c 100644 --- a/ultralytics/yolo/engine/trainer.py +++ b/ultralytics/yolo/engine/trainer.py @@ -197,10 +197,11 @@ class BaseTrainer: self.device = torch.device('cuda', RANK) LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}') os.environ['NCCL_BLOCKING_WAIT'] = '1' # set to enforce timeout - dist.init_process_group('nccl' if dist.is_nccl_available() else 'gloo', - timeout=timedelta(seconds=3600), - rank=RANK, - world_size=world_size) + dist.init_process_group( + 'nccl' if dist.is_nccl_available() else 'gloo', + timeout=timedelta(seconds=10800), # 3 hours + rank=RANK, + world_size=world_size) def _setup_train(self, world_size): """