Update generate_ddp_file for improved overrides (#2909)

Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
This commit is contained in:
Laughing
2023-05-31 01:41:44 +08:00
committed by GitHub
parent facb7861cf
commit 305cde69d0
3 changed files with 9 additions and 8 deletions

View File

@@ -182,7 +182,7 @@ class BaseTrainer:
# Command
cmd, file = generate_ddp_command(world_size, self)
try:
-LOGGER.info(f'Running DDP command {cmd}')
+LOGGER.info(f'DDP command: {cmd}')
subprocess.run(cmd, check=True)
except Exception as e:
raise e
@@ -195,7 +195,7 @@ class BaseTrainer:
"""Initializes and sets the DistributedDataParallel parameters for training."""
torch.cuda.set_device(RANK)
self.device = torch.device('cuda', RANK)
-LOGGER.info(f'DDP settings: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
+LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
os.environ['NCCL_BLOCKING_WAIT'] = '1' # set to enforce timeout
dist.init_process_group('nccl' if dist.is_nccl_available() else 'gloo',
timeout=timedelta(seconds=3600),