Add clearml logging (#51)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
single_channel
Ayush Chaurasia 2 years ago committed by GitHub
parent 512a225ce8
commit 298287298d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,10 +1,6 @@
""" """
Simple training loop; Boilerplate that could apply to any arbitrary neural network, Simple training loop; Boilerplate that could apply to any arbitrary neural network,
""" """
# TODOs
# 1. finish _set_model_attributes
# 2. allow num_class update for both pretrained and csv_loaded models
# 3. save
import os import os
import time import time
@ -24,7 +20,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP
from tqdm import tqdm from tqdm import tqdm
import ultralytics.yolo.utils as utils import ultralytics.yolo.utils as utils
import ultralytics.yolo.utils.loggers as loggers import ultralytics.yolo.utils.callbacks as callbacks
from ultralytics.yolo.data.utils import check_dataset, check_dataset_yaml from ultralytics.yolo.data.utils import check_dataset, check_dataset_yaml
from ultralytics.yolo.utils import LOGGER, ROOT, TQDM_BAR_FORMAT from ultralytics.yolo.utils import LOGGER, ROOT, TQDM_BAR_FORMAT
from ultralytics.yolo.utils.checks import print_args from ultralytics.yolo.utils.checks import print_args
@ -73,8 +69,9 @@ class BaseTrainer:
self.fitness = None self.fitness = None
self.loss = None self.loss = None
for callback, func in loggers.default_callbacks.items(): for callback, func in callbacks.default_callbacks.items():
self.add_callback(callback, func) self.add_callback(callback, func)
callbacks.add_integration_callbacks(self)
def _get_config(self, config: Union[str, DictConfig], overrides: Union[str, Dict] = {}): def _get_config(self, config: Union[str, DictConfig], overrides: Union[str, Dict] = {}):
""" """
@ -146,7 +143,6 @@ class BaseTrainer:
self.test_loader = self.get_dataloader(self.testset, batch_size=self.args.batch_size * 2, rank=-1) self.test_loader = self.get_dataloader(self.testset, batch_size=self.args.batch_size * 2, rank=-1)
self.validator = self.get_validator() self.validator = self.get_validator()
print("created testloader :", rank) print("created testloader :", rank)
self.console.info(self.progress_string())
self.ema = ModelEMA(self.model) self.ema = ModelEMA(self.model)
def _do_train(self, rank=-1, world_size=1): def _do_train(self, rank=-1, world_size=1):
@ -155,7 +151,7 @@ class BaseTrainer:
else: else:
self.model = self.model.to(self.device) self.model = self.model.to(self.device)
# callback hook. before_train self.trigger_callbacks("before_train")
self._setup_train(rank) self._setup_train(rank)
self.epoch = 1 self.epoch = 1
@ -163,22 +159,22 @@ class BaseTrainer:
self.epoch_time_start = time.time() self.epoch_time_start = time.time()
self.train_time_start = time.time() self.train_time_start = time.time()
for epoch in range(self.args.epochs): for epoch in range(self.args.epochs):
# callback hook. on_epoch_start self.trigger_callbacks("on_epoch_start")
self.model.train() self.model.train()
pbar = enumerate(self.train_loader) pbar = enumerate(self.train_loader)
if rank in {-1, 0}: if rank in {-1, 0}:
pbar = tqdm(enumerate(self.train_loader), total=len(self.train_loader), bar_format=TQDM_BAR_FORMAT) pbar = tqdm(enumerate(self.train_loader), total=len(self.train_loader), bar_format=TQDM_BAR_FORMAT)
tloss = None self.tloss = None
for i, batch in pbar: for i, batch in pbar:
# img, label (classification)/ img, targets, paths, _, masks(detection) self.trigger_callbacks("on_batch_start")
# callback hook. on_batch_start
# forward # forward
batch = self.preprocess_batch(batch) batch = self.preprocess_batch(batch)
# TODO: warmup, multiscale # TODO: warmup, multiscale
preds = self.model(batch["img"]) preds = self.model(batch["img"])
self.loss, self.loss_items = self.criterion(preds, batch) self.loss, self.loss_items = self.criterion(preds, batch)
tloss = (tloss * i + self.loss_items) / (i + 1) if tloss is not None else self.loss_items self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \
else self.loss_items
# backward # backward
self.model.zero_grad(set_to_none=True) self.model.zero_grad(set_to_none=True)
@ -186,28 +182,28 @@ class BaseTrainer:
# optimize # optimize
self.optimizer_step() self.optimizer_step()
self.trigger_callbacks('on_batch_end')
# log # log
mem = (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) mem = (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB)
loss_len = tloss.shape[0] if len(tloss.size()) else 1 loss_len = self.tloss.shape[0] if len(self.tloss.size()) else 1
losses = tloss if loss_len > 1 else torch.unsqueeze(tloss, 0) losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
if rank in {-1, 0}: if rank in {-1, 0}:
pbar.set_description( pbar.set_description(
(" {} " + "{:.3f} " * (1 + loss_len) + ' {} ').format(f'{epoch + 1}/{self.args.epochs}', mem, (" {} " + "{:.3f} " * (1 + loss_len) + ' {} ').format(f'{epoch + 1}/{self.args.epochs}', mem,
*losses, batch["img"].shape[-1])) *losses, batch["img"].shape[-1]))
self.trigger_callbacks('on_batch_end')
if rank in [-1, 0]: if rank in [-1, 0]:
# validation # validation
# callback: on_val_start() self.trigger_callbacks('on_val_start')
self.ema.update_attr(self.model, include=['yaml', 'nc', 'args', 'names', 'stride', 'class_weights']) self.ema.update_attr(self.model, include=['yaml', 'nc', 'args', 'names', 'stride', 'class_weights'])
self.validate() self.metrics, self.fitness = self.validate()
# callback: on_val_end() self.trigger_callbacks('on_val_end')
# save model # save model
if (not self.args.nosave) or (self.epoch + 1 == self.args.epochs): if (not self.args.nosave) or (self.epoch + 1 == self.args.epochs):
self.save_model() self.save_model()
# callback; on_model_save self.trigger_callbacks('on_model_save')
self.epoch += 1 self.epoch += 1
tnow = time.time() tnow = time.time()
@ -216,9 +212,8 @@ class BaseTrainer:
# TODO: termination condition # TODO: termination condition
self.log(f"\nTraining complete ({(time.time() - self.train_time_start) / 3600:.3f} hours) \ self.log(f"\nTraining complete ({(time.time() - self.train_time_start) / 3600:.3f} hours)")
\n{self.usage_help()}") self.trigger_callbacks('on_train_end')
# callback; on_train_end
dist.destroy_process_group() if world_size != 1 else None dist.destroy_process_group() if world_size != 1 else None
def save_model(self): def save_model(self):
@ -238,12 +233,6 @@ class BaseTrainer:
torch.save(ckpt, self.best) torch.save(ckpt, self.best)
del ckpt del ckpt
def get_dataloader(self, dataset_path, batch_size=16, rank=0):
"""
Returns dataloader derived from torch.data.Dataloader
"""
pass
def get_dataset(self, data): def get_dataset(self, data):
""" """
Get train, val path from data dict if it exists. Returns None if data format is not recognized Get train, val path from data dict if it exists. Returns None if data format is not recognized
@ -259,12 +248,6 @@ class BaseTrainer:
weights=get_model(model) if pretrained else None, weights=get_model(model) if pretrained else None,
data=self.data) # model data=self.data) # model
def load_model(self, model_cfg, weights, data):
raise NotImplementedError("This task trainer doesn't support loading cfg files")
def get_validator(self):
pass
def optimizer_step(self): def optimizer_step(self):
self.scaler.unscale_(self.optimizer) # unscale gradients self.scaler.unscale_(self.optimizer) # unscale gradients
torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0) # clip gradients torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0) # clip gradients
@ -286,48 +269,55 @@ class BaseTrainer:
# TODO: discuss validator class. Enforce that a validator metrics dict should contain # TODO: discuss validator class. Enforce that a validator metrics dict should contain
"fitness" metric. "fitness" metric.
""" """
self.metrics = self.validator(self) metrics = self.validator(self)
self.fitness = self.metrics.get("fitness", fitness = metrics.get("fitness", -self.loss.detach().cpu().numpy()) # use loss as fitness measure if not found
-self.loss.detach().cpu().numpy()) # use loss as fitness measure if not found if not self.best_fitness or self.best_fitness < fitness:
if not self.best_fitness or self.best_fitness < self.fitness:
self.best_fitness = self.fitness self.best_fitness = self.fitness
return metrics, fitness
def set_model_attributes(self): def log(self, text, rank=-1):
""" """
To set or update model parameters before training. Logs the given text to given ranks process if provided, otherwise logs to all ranks
:param text: text to log
:param rank: List[Int]
""" """
pass if rank in {-1, 0}:
self.console.info(text)
def build_targets(self, preds, targets): def load_model(self, model_cfg, weights, data):
pass raise NotImplementedError("This task trainer doesn't support loading cfg files")
def get_validator(self):
raise NotImplementedError("get_validator function not implemented in trainer")
def get_dataloader(self, dataset_path, batch_size=16, rank=0):
"""
Returns dataloader derived from torch.data.Dataloader
"""
raise NotImplementedError("get_dataloader function not implemented in trainer")
def criterion(self, preds, batch): def criterion(self, preds, batch):
""" """
Returns loss and individual loss items as Tensor Returns loss and individual loss items as Tensor
""" """
pass raise NotImplementedError("criterion function not implemented in trainer")
def progress_string(self): def label_loss_items(self, loss_items):
""" """
Returns progress string depending on task type. Returns a loss dict with labelled training loss items tensor
""" """
return '' # Not needed for classification but necessary for segmentation & detection
return {"loss": loss_items}
def usage_help(self): def set_model_attributes(self):
""" """
Returns usage functionality. gets printed to the console after training. To set or update model parameters before training.
""" """
pass pass
def log(self, text, rank=-1): def build_targets(self, preds, targets):
""" pass
Logs the given text to given ranks process if provided, otherwise logs to all ranks
:param text: text to log
:param rank: List[Int]
"""
if rank in {-1, 0}:
self.console.info(text)
def build_optimizer(model, name='Adam', lr=0.001, momentum=0.9, decay=1e-5): def build_optimizer(model, name='Adam', lr=0.001, momentum=0.9, decay=1e-5):

@ -24,6 +24,7 @@ class BaseValidator:
self.cuda = self.device.type != 'cpu' self.cuda = self.device.type != 'cpu'
self.batch_i = None self.batch_i = None
self.training = True self.training = True
self.loss = None
def __call__(self, trainer=None, model=None): def __call__(self, trainer=None, model=None):
""" """
@ -44,7 +45,7 @@ class BaseValidator:
model.eval() model.eval()
dt = Profile(), Profile(), Profile(), Profile() dt = Profile(), Profile(), Profile(), Profile()
loss = 0 self.loss = 0
n_batches = len(self.dataloader) n_batches = len(self.dataloader)
desc = self.get_desc() desc = self.get_desc()
bar = tqdm(self.dataloader, desc, n_batches, not self.training, bar_format=TQDM_BAR_FORMAT) bar = tqdm(self.dataloader, desc, n_batches, not self.training, bar_format=TQDM_BAR_FORMAT)
@ -65,7 +66,7 @@ class BaseValidator:
# loss # loss
with dt[2]: with dt[2]:
if self.training: if self.training:
loss += trainer.criterion(preds, batch)[0] self.loss += trainer.criterion(preds, batch)[0]
# pre-process predictions # pre-process predictions
with dt[3]: with dt[3]:

@ -0,0 +1 @@
from .base import add_integration_callbacks, default_callbacks

@ -30,3 +30,12 @@ default_callbacks = {
"on_val_start": on_val_start, "on_val_start": on_val_start,
"on_val_end": on_val_end, "on_val_end": on_val_end,
"on_model_save": on_model_save} "on_model_save": on_model_save}
def add_integration_callbacks(trainer):
    """
    Register the ClearML integration callbacks on the given trainer.

    The sibling ``clearml`` module exposes a ``clearml`` name that it sets to ``None``
    when the package cannot be imported; handlers are attached only when that name
    is truthy, otherwise the trainer is left untouched.

    :param trainer: object exposing ``add_callback(name, func)``
    """
    # Local import; aliased so the hook mapping is not confused with a local
    # placeholder (the previous `callbacks = {}` was a dead store, overwritten here).
    from .clearml import callbacks as clearml_callbacks, clearml

    if clearml:
        for callback, func in clearml_callbacks.items():
            trainer.add_callback(callback, func)

@ -0,0 +1,45 @@
try:
import clearml
from clearml import Task
assert hasattr(clearml, '__version__')
except (ImportError, AssertionError):
clearml = None
def _log_scalers(metric_dict, group="", step=0):
    """
    Report every entry of ``metric_dict`` as a scalar on the current ClearML task.

    Does nothing when ``Task.current_task()`` returns a falsy value.

    :param metric_dict: mapping of series name -> value to report
    :param group: first argument passed to ``report_scalar`` for every entry
    :param step: last argument passed to ``report_scalar`` for every entry
    """
    current = Task.current_task()
    if not current:
        return
    for series, value in metric_dict.items():
        current.get_logger().report_scalar(group, series, value, step)
def before_train(trainer):
    """
    Create a ClearML task for this run and attach the trainer arguments to it.

    The project name is ``trainer.args.project`` unless it equals 'runs/train', in which
    case 'YOLOv5' is used; the task name is ``trainer.args.name`` unless it equals 'exp',
    in which case 'Training' is used.

    :param trainer: trainer whose ``args`` provide ``project`` and ``name``
    """
    # TODO: reuse existing task
    project_name = trainer.args.project if trainer.args.project != 'runs/train' else 'YOLOv5'
    task_name = trainer.args.name if trainer.args.name != 'exp' else 'Training'
    frameworks = {'pytorch': False}
    task = Task.init(
        project_name=project_name,
        task_name=task_name,
        tags=['YOLOv5'],
        output_uri=True,
        reuse_last_task_id=False,
        auto_connect_frameworks=frameworks,
    )
    task.connect(trainer.args, name='parameters')
def on_batch_end(trainer):
    """
    Log the trainer's ``tloss`` under the "train" group.

    The loss items are labelled through ``trainer.label_loss_items`` and reported with
    ``trainer.epoch`` as the step.
    """
    labelled_losses = trainer.label_loss_items(trainer.tloss)
    _log_scalers(labelled_losses, "train", trainer.epoch)
def on_val_end(trainer):
    """
    Log validation losses and metrics once validation has finished.

    ``trainer.validator.loss`` is labelled through ``trainer.label_loss_items`` and
    reported under "val"; ``trainer.metrics`` is reported under "metrics". Both use
    ``trainer.epoch`` as the step.
    """
    run_metrics = trainer.metrics
    labelled_val_losses = trainer.label_loss_items(trainer.validator.loss)
    for payload, group in ((labelled_val_losses, "val"), (run_metrics, "metrics")):
        _log_scalers(payload, group, trainer.epoch)
# Trainer hook name -> ClearML handler defined in this module.
# add_integration_callbacks iterates this mapping and registers each pair via
# trainer.add_callback when the clearml package imported successfully.
callbacks = {
"before_train": before_train,
"on_val_end": on_val_end,
"on_batch_end": on_batch_end,}

@ -1 +0,0 @@
from .base import default_callbacks

@ -234,6 +234,11 @@ class SegmentationTrainer(BaseTrainer):
loss = lbox + lobj + lcls + lseg loss = lbox + lobj + lcls + lseg
return loss * bs, torch.cat((lbox, lseg, lobj, lcls)).detach() return loss * bs, torch.cat((lbox, lseg, lobj, lcls)).detach()
def label_loss_items(self, loss_items):
    """
    Pair each loss item with its name and return the result as a dict.

    Items are matched positionally against "lbox", "lseg", "lobj", "lcls"; pairing
    stops at the shorter of the two sequences.

    :param loss_items: iterable of loss values in the order listed above
    :return: dict mapping loss name -> loss item
    """
    # We should just use named tensors here in future
    names = ("lbox", "lseg", "lobj", "lcls")
    return {name: item for name, item in zip(names, loss_items)}
def progress_string(self): def progress_string(self):
return ('\n' + '%11s' * 7) % \ return ('\n' + '%11s' * 7) % \
('Epoch', 'GPU_mem', 'box_loss', 'seg_loss', 'obj_loss', 'cls_loss', 'Size') ('Epoch', 'GPU_mem', 'box_loss', 'seg_loss', 'obj_loss', 'cls_loss', 'Size')

Loading…
Cancel
Save