From cb4f20f3cf04c897538481fe40572595ccd7a4d5 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Sat, 24 Dec 2022 14:37:46 +0100
Subject: [PATCH] Add TensorBoard support (#87)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .github/workflows/ci.yaml                   |  6 +-
 ultralytics/yolo/engine/trainer.py          | 16 ++--
 ultralytics/yolo/utils/callbacks/base.py    | 94 +++++++++++++++++----
 ultralytics/yolo/utils/callbacks/clearml.py | 40 +++------
 ultralytics/yolo/utils/callbacks/tb.py      | 26 ++++++
 ultralytics/yolo/utils/configs/default.yaml |  2 +-
 6 files changed, 133 insertions(+), 51 deletions(-)
 create mode 100644 ultralytics/yolo/utils/callbacks/tb.py

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f6f7b83..49d0c47 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -91,15 +91,15 @@ jobs:
         shell: bash  # for Windows compatibility
         run: |
           yolo task=detect mode=train model=yolov5n.yaml data=coco128.yaml epochs=1 imgsz=64
-          yolo task=detect mode=val model=runs/exp/weights/last.pt imgsz=64
+          yolo task=detect mode=val model=runs/train/exp/weights/last.pt imgsz=64
       - name: Test segmentation
         shell: bash  # for Windows compatibility
         # TODO: redo val test without hardcoded weights
         run: |
           yolo task=segment mode=train model=yolov5n-seg.yaml data=coco128-seg.yaml epochs=1 imgsz=64
-          yolo task=segment mode=val model=runs/exp2/weights/last.pt data=coco128-seg.yaml imgsz=64
+          yolo task=segment mode=val model=runs/train/exp2/weights/last.pt data=coco128-seg.yaml imgsz=64
       - name: Test classification
         shell: bash  # for Windows compatibility
         run: |
           yolo task=classify mode=train model=resnet18 data=mnist160 epochs=1 imgsz=32
-          yolo task=classify mode=val model=runs/exp3/weights/last.pt data=mnist160
+          yolo task=classify mode=val model=runs/train/exp3/weights/last.pt data=mnist160
diff --git a/ultralytics/yolo/engine/trainer.py b/ultralytics/yolo/engine/trainer.py
index 5f603f8..bcb64f5 100644
--- a/ultralytics/yolo/engine/trainer.py
+++ b/ultralytics/yolo/engine/trainer.py
@@ -4,7 +4,6 @@ Simple training loop; Boilerplate that could apply to any arbitrary neural network
 
 import os
 import subprocess
-import sys
 import time
 from collections import defaultdict
 from copy import deepcopy
@@ -128,6 +127,7 @@ class BaseTrainer:
         Builds dataloaders and optimizer on correct rank process
         """
         # model
+        self.trigger_callbacks("on_pretrain_routine_start")
         ckpt = self.setup_model()
         self.model = self.model.to(self.device)
         self.set_model_attributes()
@@ -159,13 +159,13 @@ class BaseTrainer:
         # metric_keys = self.validator.metric_keys + self.label_loss_items(prefix="val")
         # self.metrics = dict(zip(metric_keys, [0] * len(metric_keys)))  # TODO: init metrics for plot_results()?
         self.ema = ModelEMA(self.model)
+        self.trigger_callbacks("on_pretrain_routine_end")
 
     def _do_train(self, rank=-1, world_size=1):
         if world_size > 1:
             self._setup_ddp(rank, world_size)
 
         self._setup_train(rank, world_size)
-        self.trigger_callbacks("before_train")
 
         self.epoch_time = None
         self.epoch_time_start = time.time()
@@ -173,9 +173,10 @@ class BaseTrainer:
         nb = len(self.train_loader)  # number of batches
         nw = max(round(self.args.warmup_epochs * nb), 100)  # number of warmup iterations
         last_opt_step = -1
+        self.trigger_callbacks("on_train_start")
         for epoch in range(self.start_epoch, self.epochs):
             self.epoch = epoch
-            self.trigger_callbacks("on_epoch_start")
+            self.trigger_callbacks("on_train_epoch_start")
             self.model.train()
             if rank != -1:
                 self.train_loader.sampler.set_epoch(epoch)
@@ -186,7 +187,7 @@ class BaseTrainer:
             self.tloss = None
             self.optimizer.zero_grad()
             for i, batch in pbar:
-                self.trigger_callbacks("on_batch_start")
+                self.trigger_callbacks("on_train_batch_start")
 
                 # forward
                 batch = self.preprocess_batch(batch)
@@ -207,7 +208,7 @@ class BaseTrainer:
                 if rank != -1:
                     self.loss *= world_size
                 self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \
-                        else self.loss_items
+                    else self.loss_items
 
                 # backward
                 self.scaler.scale(self.loss).backward()
@@ -229,8 +230,11 @@ class BaseTrainer:
                 if self.args.plots and ni < 3:
                     self.plot_training_samples(batch, ni)
 
+                self.trigger_callbacks("on_train_batch_end")
+
             lr = {f"lr{ir}": x['lr'] for ir, x in enumerate(self.optimizer.param_groups)}  # for loggers
             self.scheduler.step()
+            self.trigger_callbacks("on_train_epoch_end")
 
             if rank in [-1, 0]:
                 # validation
@@ -260,9 +264,11 @@ class BaseTrainer:
             if self.args.plots:
                 self.plot_metrics()
             self.log(f"\nTraining complete ({(time.time() - self.train_time_start) / 3600:.3f} hours)")
+            self.log(f"Results saved to {colorstr('bold', self.save_dir)}")
             self.trigger_callbacks('on_train_end')
         dist.destroy_process_group() if world_size > 1 else None
         torch.cuda.empty_cache()
+        self.trigger_callbacks('teardown')
 
     def save_model(self):
         ckpt = {
diff --git a/ultralytics/yolo/utils/callbacks/base.py b/ultralytics/yolo/utils/callbacks/base.py
index 3cffa12..671c91f 100644
--- a/ultralytics/yolo/utils/callbacks/base.py
+++ b/ultralytics/yolo/utils/callbacks/base.py
@@ -1,13 +1,36 @@
-def before_train(trainer):
-    # Initialize tensorboard logger
+def on_pretrain_routine_start(trainer):
     pass
 
 
-def on_epoch_start(trainer):
+def on_pretrain_routine_end(trainer):
     pass
 
 
-def on_batch_start(trainer):
+def on_train_start(trainer):
+    pass
+
+
+def on_train_epoch_start(trainer):
+    pass
+
+
+def on_train_batch_start(trainer):
+    pass
+
+
+def optimizer_step(trainer):
+    pass
+
+
+def on_before_zero_grad(trainer):
+    pass
+
+
+def on_train_batch_end(trainer):
+    pass
+
+
+def on_train_epoch_end(trainer):
     pass
 
 
@@ -15,27 +38,68 @@ def on_val_start(trainer):
     pass
 
 
+def on_val_batch_start(trainer):
+    pass
+
+
+def on_val_image_end(trainer):
+    pass
+
+
+def on_val_batch_end(trainer):
+    pass
+
+
 def on_val_end(trainer):
     pass
 
 
+def on_fit_epoch_end(trainer):
+    pass
+
+
 def on_model_save(trainer):
     pass
 
 
+def on_train_end(trainer):
+    pass
+
+
+def on_params_update(trainer):
+    pass
+
+
+def teardown(trainer):
+    pass
+
+
 default_callbacks = {
-    "before_train": before_train,
-    "on_epoch_start": on_epoch_start,
-    "on_batch_start": on_batch_start,
-    "on_val_start": on_val_start,
-    "on_val_end": on_val_end,
-    "on_model_save": on_model_save}
+    'on_pretrain_routine_start': on_pretrain_routine_start,
+    'on_pretrain_routine_end': on_pretrain_routine_end,
+    'on_train_start': on_train_start,
+    'on_train_epoch_start': on_train_epoch_start,
+    'on_train_batch_start': on_train_batch_start,
+    'optimizer_step': optimizer_step,
+    'on_before_zero_grad': on_before_zero_grad,
+    'on_train_batch_end': on_train_batch_end,
+    'on_train_epoch_end': on_train_epoch_end,
+    'on_val_start': on_val_start,
+    'on_val_batch_start': on_val_batch_start,
+    'on_val_image_end': on_val_image_end,
+    'on_val_batch_end': on_val_batch_end,
+    'on_val_end': on_val_end,
+    'on_fit_epoch_end': on_fit_epoch_end,  # fit = train + val
+    'on_model_save': on_model_save,
+    'on_train_end': on_train_end,
+    'on_params_update': on_params_update,
+    'teardown': teardown}
 
 
 def add_integration_callbacks(trainer):
-    callbacks = {}
+    from .clearml import callbacks as clearml_callbacks
+    from .tb import callbacks as tb_callbacks
 
-    from .clearml import callbacks, clearml
-    if clearml:
-        for callback, func in callbacks.items():
-            trainer.add_callback(callback, func)
+    for x in tb_callbacks, clearml_callbacks:
+        for k, v in x.items():
+            trainer.add_callback(k, v)  # add_callback(name, func)
diff --git a/ultralytics/yolo/utils/callbacks/clearml.py b/ultralytics/yolo/utils/callbacks/clearml.py
index e1d1ece..8d4bfe8 100644
--- a/ultralytics/yolo/utils/callbacks/clearml.py
+++ b/ultralytics/yolo/utils/callbacks/clearml.py
@@ -9,47 +9,33 @@ except (ImportError, AssertionError):
     clearml = None
 
 
-def _log_scalers(metric_dict, group="", step=0):
-    task = Task.current_task()
-    if task:
-        for k, v in metric_dict.items():
-            task.get_logger().report_scalar(group, k, v, step)
-
-
-def before_train(trainer):
+def on_train_start(trainer):
     # TODO: reuse existing task
-    task = Task.init(project_name=trainer.args.project if trainer.args.project != 'runs/train' else 'YOLOv5',
-                     task_name=trainer.args.name if trainer.args.name != 'exp' else 'Training',
-                     tags=['YOLOv5'],
+    task = Task.init(project_name=trainer.args.project if trainer.args.project != 'runs/train' else 'YOLOv8',
+                     task_name=trainer.args.name,
+                     tags=['YOLOv8'],
                      output_uri=True,
                      reuse_last_task_id=False,
                      auto_connect_frameworks={'pytorch': False})
     task.connect(dict(trainer.args), name='General')
 
 
-def on_batch_end(trainer):
-    _log_scalers(trainer.label_loss_items(trainer.tloss, prefix="train"), "train", trainer.epoch)
-
-
 def on_val_end(trainer):
-    _log_scalers(trainer.label_loss_items(trainer.validator.loss, prefix="val"), "val", trainer.epoch)
-    _log_scalers({k: v for k, v in trainer.metrics.items() if k.startswith("metrics")}, "metrics", trainer.epoch)
     if trainer.epoch == 0:
         model_info = {
-            "inference_speed": trainer.validator.speed[1],
-            "flops@640": get_flops(trainer.model),
-            "params": get_num_params(trainer.model)}
-        Task.current_task().connect(model_info, 'Model')
+            "Inference speed (ms/img)": round(trainer.validator.speed[1], 1),
+            "GFLOPs": round(get_flops(trainer.model), 1),
+            "Parameters": get_num_params(trainer.model)}
+        Task.current_task().connect(model_info, name='Model')
 
 
 def on_train_end(trainer):
-    task = Task.current_task()
-    if task:
-        task.update_output_model(model_path=str(trainer.best), model_name='Best Model', auto_delete_file=False)
+    Task.current_task().update_output_model(model_path=str(trainer.best),
+                                            model_name=trainer.args.name,
+                                            auto_delete_file=False)
 
 
 callbacks = {
-    "before_train": before_train,
+    "on_train_start": on_train_start,
     "on_val_end": on_val_end,
-    "on_batch_end": on_batch_end,
-    "on_train_end": on_train_end}
+    "on_train_end": on_train_end} if clearml else {}
diff --git a/ultralytics/yolo/utils/callbacks/tb.py b/ultralytics/yolo/utils/callbacks/tb.py
new file mode 100644
index 0000000..a86a0d6
--- /dev/null
+++ b/ultralytics/yolo/utils/callbacks/tb.py
@@ -0,0 +1,26 @@
+from torch.utils.tensorboard import SummaryWriter
+
+writer = None  # TensorBoard SummaryWriter instance
+
+
+def _log_scalars(scalars, step=0):
+    for k, v in scalars.items():
+        writer.add_scalar(k, v, step)
+
+
+def on_train_start(trainer):
+    global writer
+    writer = SummaryWriter(str(trainer.save_dir))
+    trainer.console.info(f"Logging results to {trainer.save_dir}\n"
+                         f"Starting training for {trainer.args.epochs} epochs...")
+
+
+def on_batch_end(trainer):
+    _log_scalars(trainer.label_loss_items(trainer.tloss, prefix="train"), trainer.epoch)
+
+
+def on_val_end(trainer):
+    _log_scalars(trainer.metrics, trainer.epoch)
+
+
+callbacks = {"on_train_start": on_train_start, "on_val_end": on_val_end, "on_batch_end": on_batch_end}
diff --git a/ultralytics/yolo/utils/configs/default.yaml b/ultralytics/yolo/utils/configs/default.yaml
index c530516..5ef0de6 100644
--- a/ultralytics/yolo/utils/configs/default.yaml
+++ b/ultralytics/yolo/utils/configs/default.yaml
@@ -15,7 +15,7 @@ nosave: False
 cache: False # True/ram, disk or False
 device: '' # cuda device, i.e. 0 or 0,1,2,3 or cpu
 workers: 8
-project: 'runs'
+project: 'runs/train'
 name: 'exp'
 exist_ok: False
 pretrained: False
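
As a reading aid for the callback changes above, here is a minimal, self-contained sketch of the add_callback / trigger_callbacks pattern that BaseTrainer and add_integration_callbacks rely on. MiniTrainer, its fit() loop, and the printed message are hypothetical stand-ins for illustration only; the event names and the add_callback(name, func) / trigger_callbacks(name) signatures are the ones introduced in the patch.

from collections import defaultdict


class MiniTrainer:
    """Hypothetical stand-in for BaseTrainer, showing only the callback plumbing."""

    def __init__(self):
        self.callbacks = defaultdict(list)  # event name -> list of hook functions
        self.epoch = 0

    def add_callback(self, event, func):
        self.callbacks[event].append(func)  # same shape as trainer.add_callback(k, v)

    def trigger_callbacks(self, event):
        for func in self.callbacks[event]:
            func(self)  # every hook receives the trainer instance, e.g. on_val_end(trainer)

    def fit(self, epochs=2):
        self.trigger_callbacks("on_train_start")
        for self.epoch in range(epochs):
            self.trigger_callbacks("on_train_epoch_start")
            self.trigger_callbacks("on_train_epoch_end")
        self.trigger_callbacks("on_train_end")
        self.trigger_callbacks("teardown")


if __name__ == "__main__":
    trainer = MiniTrainer()
    trainer.add_callback("on_train_epoch_end", lambda t: print(f"epoch {t.epoch} done"))
    trainer.fit()

In the patch itself, add_integration_callbacks plays the role of the registration step: it merges the {event: function} dicts exported by tb.py and clearml.py into the trainer's registry via trainer.add_callback(k, v).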