ultralytics 8.0.99
HUB resume fix and Docs updates (#2567)
Co-authored-by: Ayush Chaurasia <ayush.chaurarsia@gmail.com>
Co-authored-by: Yonghye Kwon <developer.0hye@gmail.com>
@@ -190,6 +190,17 @@ class BaseTrainer:
         else:
             self._do_train(world_size)
 
+    def _pre_caching_dataset(self):
+        """
+        Caching dataset before training to avoid NCCL timeout.
+        Must be done before DDP initialization.
+        See https://github.com/ultralytics/ultralytics/pull/2549 for details.
+        """
+        if RANK in (-1, 0):
+            LOGGER.info('Pre-caching dataset to avoid NCCL timeout')
+            self.get_dataloader(self.trainset, batch_size=1, rank=RANK, mode='train')
+            self.get_dataloader(self.testset, batch_size=1, rank=-1, mode='val')
+
     def _setup_ddp(self, world_size):
         """Initializes and sets the DistributedDataParallel parameters for training."""
         torch.cuda.set_device(RANK)
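Note: the new `_pre_caching_dataset()` exists because building the dataset cache can take longer than the NCCL watchdog allows once a process group is up. A minimal sketch of the general pattern, outside ultralytics (the `cache_dataset` helper and the one-hour timeout are illustrative assumptions, not part of this PR):

import os
from datetime import timedelta

import torch.distributed as dist


def cache_dataset():
    """Hypothetical stand-in for a slow run-once step, e.g. writing label .cache files."""


def main():
    # RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT are set by the launcher (torchrun)
    rank = int(os.environ.get('RANK', -1))
    if rank in (-1, 0):
        cache_dataset()  # slow step runs before any process group exists

    # Other ranks skipped the step and reuse the cache files from disk, so no
    # NCCL collective is left waiting on rank 0 past the watchdog (default 30 min)
    dist.init_process_group(backend='nccl',
                            timeout=timedelta(hours=1),
                            rank=rank,
                            world_size=int(os.environ['WORLD_SIZE']))


if __name__ == '__main__':
    main()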
@@ -263,6 +274,7 @@ class BaseTrainer:
     def _do_train(self, world_size=1):
         """Train completed, evaluate and plot if specified by arguments."""
         if world_size > 1:
+            self._pre_caching_dataset()
             self._setup_ddp(world_size)
 
         self._setup_train(world_size)
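For reference, the `world_size > 1` branch above is what a multi-GPU call reaches. A sketch using the public 8.0.x Python API (model, dataset and epoch values are arbitrary examples):

from ultralytics import YOLO

model = YOLO('yolov8n.pt')
# Two devices -> world_size == 2, so _do_train() now pre-caches the dataset
# on rank 0 and only then initializes DDP.
model.train(data='coco128.yaml', epochs=3, device=[0, 1])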
@@ -549,10 +561,15 @@ class BaseTrainer:
         resume = self.args.resume
         if resume:
             try:
-                last = Path(
-                    check_file(resume) if isinstance(resume, (str,
-                                                              Path)) and Path(resume).exists() else get_latest_run())
-                self.args = get_cfg(attempt_load_weights(last).args)
+                exists = isinstance(resume, (str, Path)) and Path(resume).exists()
+                last = Path(check_file(resume) if exists else get_latest_run())
+
+                # Check that resume data YAML exists, otherwise strip to force re-download of dataset
+                ckpt_args = attempt_load_weights(last).args
+                if not Path(ckpt_args['data']).exists():
+                    ckpt_args['data'] = self.args.data
+
+                self.args = get_cfg(ckpt_args)
                 self.args.model, resume = str(last), True  # reinstate
             except Exception as e:
                 raise FileNotFoundError('Resume checkpoint not found. Please pass a valid checkpoint to resume from, '
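The effect of the resume change, sketched with the public 8.0.x Python API (the checkpoint path is illustrative): if the `data` YAML recorded inside the checkpoint no longer exists locally, e.g. a HUB-trained `last.pt` moved to a new machine, training now falls back to the current `data` argument instead of failing on the stale path.

from ultralytics import YOLO

# Checkpoint path is an example; get_latest_run() is used instead when
# resume=True is passed without an explicit checkpoint.
model = YOLO('runs/detect/train/weights/last.pt')
model.train(resume=True)  # dataset is re-resolved if the recorded path is gone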