Increase NCCL timeout from 1 hour to 3 hours (#3343)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
single_channel
Jieun Park 1 year ago committed by GitHub
parent 137552996a
commit 2ebd808b69
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -197,10 +197,11 @@ class BaseTrainer:
self.device = torch.device('cuda', RANK) self.device = torch.device('cuda', RANK)
LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}') LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
os.environ['NCCL_BLOCKING_WAIT'] = '1' # set to enforce timeout os.environ['NCCL_BLOCKING_WAIT'] = '1' # set to enforce timeout
dist.init_process_group('nccl' if dist.is_nccl_available() else 'gloo', dist.init_process_group(
timeout=timedelta(seconds=3600), 'nccl' if dist.is_nccl_available() else 'gloo',
rank=RANK, timeout=timedelta(seconds=10800), # 3 hours
world_size=world_size) rank=RANK,
world_size=world_size)
def _setup_train(self, world_size): def _setup_train(self, world_size):
""" """

Loading…
Cancel
Save