From 2ebd808b690b6695d9366a532714cd799c83bb19 Mon Sep 17 00:00:00 2001
From: Jieun Park <103101268+rumjie@users.noreply.github.com>
Date: Sat, 24 Jun 2023 22:56:18 +0900
Subject: [PATCH] Increase NCCL timeout from 1 hour to 3 hours (#3343)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Glenn Jocher
---
 ultralytics/yolo/engine/trainer.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/ultralytics/yolo/engine/trainer.py b/ultralytics/yolo/engine/trainer.py
index e4925b3..144be9c 100644
--- a/ultralytics/yolo/engine/trainer.py
+++ b/ultralytics/yolo/engine/trainer.py
@@ -197,10 +197,11 @@ class BaseTrainer:
         self.device = torch.device('cuda', RANK)
         LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
         os.environ['NCCL_BLOCKING_WAIT'] = '1'  # set to enforce timeout
-        dist.init_process_group('nccl' if dist.is_nccl_available() else 'gloo',
-                                timeout=timedelta(seconds=3600),
-                                rank=RANK,
-                                world_size=world_size)
+        dist.init_process_group(
+            'nccl' if dist.is_nccl_available() else 'gloo',
+            timeout=timedelta(seconds=10800),  # 3 hours
+            rank=RANK,
+            world_size=world_size)
 
     def _setup_train(self, world_size):
         """