Add clearml logging (#51)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2022-11-24 19:55:03 +05:30
parent 512a225ce8
commit 298287298d
7 changed files with 119 additions and 69 deletions
--- a/ultralytics/yolo/engine/trainer.py
+++ b/ultralytics/yolo/engine/trainer.py
@ -1,10 +1,6 @@
 """
 Simple training loop; Boilerplate that could apply to any arbitrary neural network,
 """
-# TODOs
-# 1. finish _set_model_attributes
-# 2. allow num_class update for both pretrained and csv_loaded models
-# 3. save

 import os
 import time
@ -24,7 +20,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from tqdm import tqdm

 import ultralytics.yolo.utils as utils
-import ultralytics.yolo.utils.loggers as loggers
+import ultralytics.yolo.utils.callbacks as callbacks
 from ultralytics.yolo.data.utils import check_dataset, check_dataset_yaml
 from ultralytics.yolo.utils import LOGGER, ROOT, TQDM_BAR_FORMAT
 from ultralytics.yolo.utils.checks import print_args
@ -73,8 +69,9 @@ class BaseTrainer:
        self.fitness = None
        self.loss = None

-        for callback, func in loggers.default_callbacks.items():
+        for callback, func in callbacks.default_callbacks.items():
            self.add_callback(callback, func)
+        callbacks.add_integration_callbacks(self)

    def _get_config(self, config: Union[str, DictConfig], overrides: Union[str, Dict] = {}):
        """
@ -146,7 +143,6 @@ class BaseTrainer:
            self.test_loader = self.get_dataloader(self.testset, batch_size=self.args.batch_size * 2, rank=-1)
            self.validator = self.get_validator()
            print("created testloader :", rank)
-            self.console.info(self.progress_string())
            self.ema = ModelEMA(self.model)

    def _do_train(self, rank=-1, world_size=1):
@ -155,7 +151,7 @@ class BaseTrainer:
        else:
            self.model = self.model.to(self.device)

-        # callback hook. before_train
+        self.trigger_callbacks("before_train")
        self._setup_train(rank)

        self.epoch = 1
@ -163,22 +159,22 @@ class BaseTrainer:
        self.epoch_time_start = time.time()
        self.train_time_start = time.time()
        for epoch in range(self.args.epochs):
-            # callback hook. on_epoch_start
+            self.trigger_callbacks("on_epoch_start")
            self.model.train()
            pbar = enumerate(self.train_loader)
            if rank in {-1, 0}:
                pbar = tqdm(enumerate(self.train_loader), total=len(self.train_loader), bar_format=TQDM_BAR_FORMAT)
-            tloss = None
+            self.tloss = None
            for i, batch in pbar:
-                # img, label (classification)/ img, targets, paths, _, masks(detection)
-                # callback hook. on_batch_start
+                self.trigger_callbacks("on_batch_start")
                # forward
                batch = self.preprocess_batch(batch)

                # TODO: warmup, multiscale
                preds = self.model(batch["img"])
                self.loss, self.loss_items = self.criterion(preds, batch)
-                tloss = (tloss * i + self.loss_items) / (i + 1) if tloss is not None else self.loss_items
+                self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \
+                                else self.loss_items

                # backward
                self.model.zero_grad(set_to_none=True)
@ -186,28 +182,28 @@ class BaseTrainer:

                # optimize
                self.optimizer_step()
-                self.trigger_callbacks('on_batch_end')

                # log
                mem = (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
-                loss_len = tloss.shape[0] if len(tloss.size()) else 1
-                losses = tloss if loss_len > 1 else torch.unsqueeze(tloss, 0)
+                loss_len = self.tloss.shape[0] if len(self.tloss.size()) else 1
+                losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
                if rank in {-1, 0}:
                    pbar.set_description(
                        (" {} " + "{:.3f}  " * (1 + loss_len) + ' {} ').format(f'{epoch + 1}/{self.args.epochs}', mem,
                                                                               *losses, batch["img"].shape[-1]))
+                    self.trigger_callbacks('on_batch_end')

            if rank in [-1, 0]:
                # validation
-                # callback: on_val_start()
+                self.trigger_callbacks('on_val_start')
                self.ema.update_attr(self.model, include=['yaml', 'nc', 'args', 'names', 'stride', 'class_weights'])
-                self.validate()
-                # callback: on_val_end()
+                self.metrics, self.fitness = self.validate()
+                self.trigger_callbacks('on_val_end')

                # save model
                if (not self.args.nosave) or (self.epoch + 1 == self.args.epochs):
                    self.save_model()
-                    # callback; on_model_save
+                    self.trigger_callbacks('on_model_save')

            self.epoch += 1
            tnow = time.time()
@ -216,9 +212,8 @@ class BaseTrainer:

            # TODO: termination condition

-        self.log(f"\nTraining complete ({(time.time() - self.train_time_start) / 3600:.3f} hours) \
-                            \n{self.usage_help()}")
-        # callback; on_train_end
+        self.log(f"\nTraining complete ({(time.time() - self.train_time_start) / 3600:.3f} hours)")
+        self.trigger_callbacks('on_train_end')
        dist.destroy_process_group() if world_size != 1 else None

    def save_model(self):
@ -238,12 +233,6 @@ class BaseTrainer:
            torch.save(ckpt, self.best)
        del ckpt

-    def get_dataloader(self, dataset_path, batch_size=16, rank=0):
-        """
-        Returns dataloader derived from torch.data.Dataloader
-        """
-        pass
-
    def get_dataset(self, data):
        """
        Get train, val path from data dict if it exists. Returns None if data format is not recognized
@ -259,12 +248,6 @@ class BaseTrainer:
                               weights=get_model(model) if pretrained else None,
                               data=self.data)  # model

-    def load_model(self, model_cfg, weights, data):
-        raise NotImplementedError("This task trainer doesn't support loading cfg files")
-
-    def get_validator(self):
-        pass
-
    def optimizer_step(self):
        self.scaler.unscale_(self.optimizer)  # unscale gradients
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0)  # clip gradients
@ -286,38 +269,11 @@ class BaseTrainer:
        # TODO: discuss validator class. Enforce that a validator metrics dict should contain
        "fitness" metric.
        """
-        self.metrics = self.validator(self)
-        self.fitness = self.metrics.get("fitness",
-                                        -self.loss.detach().cpu().numpy())  # use loss as fitness measure if not found
-        if not self.best_fitness or self.best_fitness < self.fitness:
+        metrics = self.validator(self)
+        fitness = metrics.get("fitness", -self.loss.detach().cpu().numpy())  # use loss as fitness measure if not found
+        if not self.best_fitness or self.best_fitness < fitness:
            self.best_fitness = self.fitness
-
-    def set_model_attributes(self):
-        """
-        To set or update model parameters before training.
-        """
-        pass
-
-    def build_targets(self, preds, targets):
-        pass
-
-    def criterion(self, preds, batch):
-        """
-        Returns loss and individual loss items as Tensor
-        """
-        pass
-
-    def progress_string(self):
-        """
-        Returns progress string depending on task type.
-        """
-        return ''
-
-    def usage_help(self):
-        """
-        Returns usage functionality. gets printed to the console after training.
-        """
-        pass
+        return metrics, fitness

    def log(self, text, rank=-1):
        """
@ -329,6 +285,40 @@ class BaseTrainer:
        if rank in {-1, 0}:
            self.console.info(text)

+    def load_model(self, model_cfg, weights, data):
+        """
+        Placeholder for building a model from a cfg file; the base trainer has no implementation.
+
+        Raises:
+            NotImplementedError: always.
+        """
+        message = "This task trainer doesn't support loading cfg files"
+        raise NotImplementedError(message)
+
+    def get_validator(self):
+        """
+        Placeholder for creating the validator; the base trainer has no implementation.
+
+        Raises:
+            NotImplementedError: always.
+        """
+        message = "get_validator function not implemented in trainer"
+        raise NotImplementedError(message)
+
+    def get_dataloader(self, dataset_path, batch_size=16, rank=0):
+        """
+        Placeholder for building the dataloader for ``dataset_path``; the base trainer has no implementation.
+
+        Implementations are expected to return a dataloader derived from ``torch.utils.data.DataLoader``.
+
+        Raises:
+            NotImplementedError: always.
+        """
+        message = "get_dataloader function not implemented in trainer"
+        raise NotImplementedError(message)
+
+    def criterion(self, preds, batch):
+        """
+        Placeholder for the loss computation; the base trainer has no implementation.
+
+        Implementations are expected to return the loss and the individual loss items as Tensor.
+
+        Raises:
+            NotImplementedError: always.
+        """
+        message = "criterion function not implemented in trainer"
+        raise NotImplementedError(message)
+
+    def label_loss_items(self, loss_items):
+        """
+        Wrap the training loss items in a dict under the single key "loss".
+
+        Not needed for classification but necessary for segmentation & detection, where the
+        individual loss items need their own labels.
+        """
+        labelled = dict(loss=loss_items)
+        return labelled
+
+    def set_model_attributes(self):
+        """
+        Hook to set or update model parameters before training.
+
+        The base implementation does nothing and returns None.
+        """
+
+    def build_targets(self, preds, targets):
+        """Hook with no base implementation; does nothing and returns None."""
+

 def build_optimizer(model, name='Adam', lr=0.001, momentum=0.9, decay=1e-5):
    # TODO: 1. docstring with example? 2. Move this inside Trainer? or utils?
--- a/ultralytics/yolo/engine/validator.py
+++ b/ultralytics/yolo/engine/validator.py
@ -24,6 +24,7 @@ class BaseValidator:
        self.cuda = self.device.type != 'cpu'
        self.batch_i = None
        self.training = True
+        self.loss = None

    def __call__(self, trainer=None, model=None):
        """
@ -44,7 +45,7 @@ class BaseValidator:

        model.eval()
        dt = Profile(), Profile(), Profile(), Profile()
-        loss = 0
+        self.loss = 0
        n_batches = len(self.dataloader)
        desc = self.get_desc()
        bar = tqdm(self.dataloader, desc, n_batches, not self.training, bar_format=TQDM_BAR_FORMAT)
@ -65,7 +66,7 @@ class BaseValidator:
                # loss
                with dt[2]:
                    if self.training:
-                        loss += trainer.criterion(preds, batch)[0]
+                        self.loss += trainer.criterion(preds, batch)[0]

                # pre-process predictions
                with dt[3]:
--- a/ultralytics/yolo/utils/callbacks/init.py
+++ b/ultralytics/yolo/utils/callbacks/init.py
@ -0,0 +1 @@
+from .base import add_integration_callbacks, default_callbacks
--- a/ultralytics/yolo/utils/callbacks/base.py
+++ b/ultralytics/yolo/utils/callbacks/base.py
@ -30,3 +30,12 @@ default_callbacks = {
    "on_val_start": on_val_start,
    "on_val_end": on_val_end,
    "on_model_save": on_model_save}
+
+
+def add_integration_callbacks(trainer):
+    """
+    Register the callbacks of optional third-party integrations on ``trainer``.
+
+    Only the ClearML integration is wired in here: when its module reports that the ``clearml``
+    package is available, every entry of its callback table is attached through
+    ``trainer.add_callback``. Otherwise the trainer is left untouched.
+    """
+    # Imported inside the function and aliased, so the integration's table cannot be
+    # mistaken for a local or for this package's own name.
+    from .clearml import callbacks as clearml_callbacks
+    from .clearml import clearml
+
+    if clearml:
+        for event, func in clearml_callbacks.items():
+            trainer.add_callback(event, func)
--- a/ultralytics/yolo/utils/callbacks/clearml.py
+++ b/ultralytics/yolo/utils/callbacks/clearml.py
@ -0,0 +1,45 @@
+# Optional dependency: `clearml` is left as None when the package is unusable, and callers
+# test its truthiness before registering any ClearML callback.
+try:
+    import clearml
+    from clearml import Task
+
+    # Explicit check rather than `assert`, so the validation still runs under `python -O`.
+    if not hasattr(clearml, '__version__'):
+        raise ImportError("imported 'clearml' module has no '__version__' attribute")
+except ImportError:
+    clearml = None
+
+
+def _log_scalers(metric_dict, group="", step=0):
+    """
+    Report every entry of ``metric_dict`` as a scalar on the current ClearML task.
+
+    Each key/value pair is forwarded to ``report_scalar`` as ``(group, key, value, step)``.
+    Nothing is reported when ``Task.current_task()`` returns a falsy value.
+    """
+    current = Task.current_task()
+    if not current:
+        return
+    for name, value in metric_dict.items():
+        current.get_logger().report_scalar(group, name, value, step)
+
+
+def before_train(trainer):
+    """
+    Start a new ClearML task for this run and connect the trainer arguments to it.
+
+    A project of 'runs/train' is replaced by 'YOLOv5' and a name of 'exp' by 'Training';
+    any other value is passed through unchanged.
+    """
+    # TODO: reuse existing task
+    project_name = 'YOLOv5' if trainer.args.project == 'runs/train' else trainer.args.project
+    task_name = 'Training' if trainer.args.name == 'exp' else trainer.args.name
+    init_kwargs = dict(
+        project_name=project_name,
+        task_name=task_name,
+        tags=['YOLOv5'],
+        output_uri=True,
+        reuse_last_task_id=False,
+        auto_connect_frameworks={'pytorch': False},
+    )
+    task = Task.init(**init_kwargs)
+
+    task.connect(trainer.args, name='parameters')
+
+
+def on_batch_end(trainer):
+    """Report the labelled ``trainer.tloss`` items under the "train" group at step ``trainer.epoch``."""
+    labelled_losses = trainer.label_loss_items(trainer.tloss)
+    _log_scalers(labelled_losses, "train", trainer.epoch)
+
+
+def on_val_end(trainer):
+    """
+    Report validation results at step ``trainer.epoch``.
+
+    The labelled ``trainer.validator.loss`` goes to the "val" group and ``trainer.metrics``
+    to the "metrics" group.
+    """
+    val_metrics = trainer.metrics
+    labelled_val_losses = trainer.label_loss_items(trainer.validator.loss)
+    for payload, group in ((labelled_val_losses, "val"), (val_metrics, "metrics")):
+        _log_scalers(payload, group, trainer.epoch)
+
+
+# Trainer event name -> ClearML handler defined in this module.
+callbacks = dict(
+    before_train=before_train,
+    on_val_end=on_val_end,
+    on_batch_end=on_batch_end,
+)
--- a/ultralytics/yolo/utils/loggers/init.py
+++ b/ultralytics/yolo/utils/loggers/init.py
@ -1 +0,0 @@
-from .base import default_callbacks
--- a/ultralytics/yolo/v8/segment/train.py
+++ b/ultralytics/yolo/v8/segment/train.py
@ -234,6 +234,11 @@ class SegmentationTrainer(BaseTrainer):
        loss = lbox + lobj + lcls + lseg
        return loss * bs, torch.cat((lbox, lseg, lobj, lcls)).detach()

+    def label_loss_items(self, loss_items):
+        """
+        Pair each loss item with its label and return the result as a dict.
+
+        Labels are applied positionally in the order "lbox", "lseg", "lobj", "lcls", matching
+        the order in which ``criterion`` concatenates its loss items.
+        """
+        # We should just use named tensors here in future
+        labels = ("lbox", "lseg", "lobj", "lcls")
+        return {label: item for label, item in zip(labels, loss_items)}
+
    def progress_string(self):
        return ('\n' + '%11s' * 7) % \
               ('Epoch', 'GPU_mem', 'box_loss', 'seg_loss', 'obj_loss', 'cls_loss', 'Size')
				`@ -0,0 +1 @@`
				`from .base import add_integration_callbacks, default_callbacks`