|
|
@ -5,6 +5,7 @@ Simple training loop; Boilerplate that could apply to any arbitrary neural netwo
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
import os
|
|
|
|
import subprocess
|
|
|
|
import subprocess
|
|
|
|
|
|
|
|
import sys
|
|
|
|
import time
|
|
|
|
import time
|
|
|
|
from collections import defaultdict
|
|
|
|
from collections import defaultdict
|
|
|
|
from copy import deepcopy
|
|
|
|
from copy import deepcopy
|
|
|
@ -28,10 +29,10 @@ from ultralytics.yolo.utils import (DEFAULT_CFG, LOGGER, RANK, SETTINGS, TQDM_BA
|
|
|
|
yaml_save)
|
|
|
|
yaml_save)
|
|
|
|
from ultralytics.yolo.utils.autobatch import check_train_batch_size
|
|
|
|
from ultralytics.yolo.utils.autobatch import check_train_batch_size
|
|
|
|
from ultralytics.yolo.utils.checks import check_file, check_imgsz, print_args
|
|
|
|
from ultralytics.yolo.utils.checks import check_file, check_imgsz, print_args
|
|
|
|
from ultralytics.yolo.utils.dist import ddp_cleanup, generate_ddp_command
|
|
|
|
from ultralytics.yolo.utils.dist import ddp_cleanup, generate_ddp_file, find_free_network_port
|
|
|
|
from ultralytics.yolo.utils.files import get_latest_run, increment_path
|
|
|
|
from ultralytics.yolo.utils.files import get_latest_run, increment_path
|
|
|
|
from ultralytics.yolo.utils.torch_utils import (EarlyStopping, ModelEMA, de_parallel, init_seeds, one_cycle,
|
|
|
|
from ultralytics.yolo.utils.torch_utils import (EarlyStopping, ModelEMA, de_parallel, init_seeds, one_cycle,
|
|
|
|
select_device, strip_optimizer)
|
|
|
|
select_device, strip_optimizer, TORCH_1_9)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BaseTrainer:
|
|
|
|
class BaseTrainer:
|
|
|
@ -174,13 +175,18 @@ class BaseTrainer:
|
|
|
|
|
|
|
|
|
|
|
|
# Run subprocess if DDP training, else train normally
|
|
|
|
# Run subprocess if DDP training, else train normally
|
|
|
|
if world_size > 1 and "LOCAL_RANK" not in os.environ:
|
|
|
|
if world_size > 1 and "LOCAL_RANK" not in os.environ:
|
|
|
|
command = generate_ddp_command(world_size, self)
|
|
|
|
# cmd, file = generate_ddp_command(world_size, self) # security vulnerability in Snyk scans
|
|
|
|
|
|
|
|
file = generate_ddp_file(self) if sys.argv[0].endswith('yolo') else os.path.abspath(sys.argv[0])
|
|
|
|
|
|
|
|
torch_distributed_cmd = "torch.distributed.run" if TORCH_1_9 else "torch.distributed.launch"
|
|
|
|
|
|
|
|
cmd = [
|
|
|
|
|
|
|
|
sys.executable, "-m", torch_distributed_cmd, "--nproc_per_node", f"{world_size}", "--master_port",
|
|
|
|
|
|
|
|
f"{find_free_network_port()}", file] + sys.argv[1:]
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
subprocess.run(command)
|
|
|
|
subprocess.run(cmd, check=True)
|
|
|
|
except Exception as e:
|
|
|
|
except Exception as e:
|
|
|
|
self.console(e)
|
|
|
|
self.console.warning(e)
|
|
|
|
finally:
|
|
|
|
finally:
|
|
|
|
ddp_cleanup(command, self)
|
|
|
|
ddp_cleanup(self, file)
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
self._do_train(int(os.getenv("RANK", -1)), world_size)
|
|
|
|
self._do_train(int(os.getenv("RANK", -1)), world_size)
|
|
|
|
|
|
|
|
|
|
|
|