Add TensorBoard support (#87)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Branch: single_channel
Authored by Glenn Jocher 2 years ago; committed via GitHub
parent 248d54ca03
commit cb4f20f3cf

@@ -91,15 +91,15 @@ jobs:
        shell: bash  # for Windows compatibility
        run: |
          yolo task=detect mode=train model=yolov5n.yaml data=coco128.yaml epochs=1 imgsz=64
-         yolo task=detect mode=val model=runs/exp/weights/last.pt imgsz=64
+         yolo task=detect mode=val model=runs/train/exp/weights/last.pt imgsz=64
      - name: Test segmentation
        shell: bash  # for Windows compatibility
        # TODO: redo val test without hardcoded weights
        run: |
          yolo task=segment mode=train model=yolov5n-seg.yaml data=coco128-seg.yaml epochs=1 imgsz=64
-         yolo task=segment mode=val model=runs/exp2/weights/last.pt data=coco128-seg.yaml imgsz=64
+         yolo task=segment mode=val model=runs/train/exp2/weights/last.pt data=coco128-seg.yaml imgsz=64
      - name: Test classification
        shell: bash  # for Windows compatibility
        run: |
          yolo task=classify mode=train model=resnet18 data=mnist160 epochs=1 imgsz=32
-         yolo task=classify mode=val model=runs/exp3/weights/last.pt data=mnist160
+         yolo task=classify mode=val model=runs/train/exp3/weights/last.pt data=mnist160

@@ -4,7 +4,6 @@ Simple training loop; Boilerplate that could apply to any arbitrary neural network
import os
import subprocess
-import sys
import time
from collections import defaultdict
from copy import deepcopy

@@ -128,6 +127,7 @@ class BaseTrainer:
        Builds dataloaders and optimizer on correct rank process
        """
        # model
+       self.trigger_callbacks("on_pretrain_routine_start")
        ckpt = self.setup_model()
        self.model = self.model.to(self.device)
        self.set_model_attributes()

@@ -159,13 +159,13 @@ class BaseTrainer:
        # metric_keys = self.validator.metric_keys + self.label_loss_items(prefix="val")
        # self.metrics = dict(zip(metric_keys, [0] * len(metric_keys)))  # TODO: init metrics for plot_results()?
        self.ema = ModelEMA(self.model)
+       self.trigger_callbacks("on_pretrain_routine_end")

    def _do_train(self, rank=-1, world_size=1):
        if world_size > 1:
            self._setup_ddp(rank, world_size)

        self._setup_train(rank, world_size)
-       self.trigger_callbacks("before_train")

        self.epoch_time = None
        self.epoch_time_start = time.time()

@@ -173,9 +173,10 @@ class BaseTrainer:
        nb = len(self.train_loader)  # number of batches
        nw = max(round(self.args.warmup_epochs * nb), 100)  # number of warmup iterations
        last_opt_step = -1
+       self.trigger_callbacks("on_train_start")
        for epoch in range(self.start_epoch, self.epochs):
            self.epoch = epoch
-           self.trigger_callbacks("on_epoch_start")
+           self.trigger_callbacks("on_train_epoch_start")
            self.model.train()
            if rank != -1:
                self.train_loader.sampler.set_epoch(epoch)

@@ -186,7 +187,7 @@ class BaseTrainer:
            self.tloss = None
            self.optimizer.zero_grad()
            for i, batch in pbar:
-               self.trigger_callbacks("on_batch_start")
+               self.trigger_callbacks("on_train_batch_start")

                # forward
                batch = self.preprocess_batch(batch)

@@ -207,7 +208,7 @@ class BaseTrainer:
                if rank != -1:
                    self.loss *= world_size
                self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \
                    else self.loss_items

                # backward
                self.scaler.scale(self.loss).backward()

@@ -229,8 +230,11 @@ class BaseTrainer:
                if self.args.plots and ni < 3:
                    self.plot_training_samples(batch, ni)

+               self.trigger_callbacks("on_train_batch_end")
+
            lr = {f"lr{ir}": x['lr'] for ir, x in enumerate(self.optimizer.param_groups)}  # for loggers
            self.scheduler.step()
+           self.trigger_callbacks("on_train_epoch_end")

            if rank in [-1, 0]:
                # validation

@@ -260,9 +264,11 @@ class BaseTrainer:
            if self.args.plots:
                self.plot_metrics()
            self.log(f"\nTraining complete ({(time.time() - self.train_time_start) / 3600:.3f} hours)")
+           self.log(f"Results saved to {colorstr('bold', self.save_dir)}")
            self.trigger_callbacks('on_train_end')
        dist.destroy_process_group() if world_size > 1 else None
        torch.cuda.empty_cache()
+       self.trigger_callbacks('teardown')

    def save_model(self):
        ckpt = {
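The hunks above only add call sites; the registration and dispatch machinery itself is outside this diff. A minimal sketch of what add_callback and trigger_callbacks plausibly look like on the trainer (assumed implementation, not taken from this commit; defaultdict is already imported at the top of the file):

from collections import defaultdict


class CallbackMixin:
    # Illustrative stand-in for BaseTrainer's callback plumbing; details may differ in the real class.

    def __init__(self):
        self.callbacks = defaultdict(list)  # event name -> list of hook functions

    def add_callback(self, event, callback):
        # Register callback(trainer) to run whenever `event` fires, e.g. "on_train_start".
        self.callbacks[event].append(callback)

    def trigger_callbacks(self, event):
        # Call every hook registered for this event, passing the trainer itself.
        for callback in self.callbacks.get(event, []):
            callback(self)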

@@ -1,13 +1,36 @@
-def before_train(trainer):
+def on_pretrain_routine_start(trainer):
+    # Initialize tensorboard logger
    pass


-def on_epoch_start(trainer):
+def on_pretrain_routine_end(trainer):
    pass


-def on_batch_start(trainer):
+def on_train_start(trainer):
+    pass
+
+
+def on_train_epoch_start(trainer):
+    pass
+
+
+def on_train_batch_start(trainer):
+    pass
+
+
+def optimizer_step(trainer):
+    pass
+
+
+def on_before_zero_grad(trainer):
+    pass
+
+
+def on_train_batch_end(trainer):
+    pass
+
+
+def on_train_epoch_end(trainer):
    pass

@@ -15,27 +38,68 @@ def on_val_start(trainer):
    pass


+def on_val_batch_start(trainer):
+    pass
+
+
+def on_val_image_end(trainer):
+    pass
+
+
+def on_val_batch_end(trainer):
+    pass
+
+
def on_val_end(trainer):
    pass


+def on_fit_epoch_end(trainer):
+    pass
+
+
def on_model_save(trainer):
    pass


+def on_train_end(trainer):
+    pass
+
+
+def on_params_update(trainer):
+    pass
+
+
+def teardown(trainer):
+    pass
+
+
default_callbacks = {
-    "before_train": before_train,
-    "on_epoch_start": on_epoch_start,
-    "on_batch_start": on_batch_start,
-    "on_val_start": on_val_start,
-    "on_val_end": on_val_end,
-    "on_model_save": on_model_save}
+    'on_pretrain_routine_start': on_pretrain_routine_start,
+    'on_pretrain_routine_end': on_pretrain_routine_end,
+    'on_train_start': on_train_start,
+    'on_train_epoch_start': on_train_epoch_start,
+    'on_train_batch_start': on_train_batch_start,
+    'optimizer_step': optimizer_step,
+    'on_before_zero_grad': on_before_zero_grad,
+    'on_train_batch_end': on_train_batch_end,
+    'on_train_epoch_end': on_train_epoch_end,
+    'on_val_start': on_val_start,
+    'on_val_batch_start': on_val_batch_start,
+    'on_val_image_end': on_val_image_end,
+    'on_val_batch_end': on_val_batch_end,
+    'on_val_end': on_val_end,
+    'on_fit_epoch_end': on_fit_epoch_end,  # fit = train + val
+    'on_model_save': on_model_save,
+    'on_train_end': on_train_end,
+    'on_params_update': on_params_update,
+    'teardown': teardown}


def add_integration_callbacks(trainer):
-    callbacks = {}
-
-    from .clearml import callbacks, clearml
-    if clearml:
-        for callback, func in callbacks.items():
-            trainer.add_callback(callback, func)
+    from .clearml import callbacks as clearml_callbacks
+    from .tb import callbacks as tb_callbacks
+
+    for x in tb_callbacks, clearml_callbacks:
+        for k, v in x.items():
+            trainer.add_callback(k, v)  # add_callback(name, func)
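To make the merge in add_integration_callbacks concrete, here is a small runnable simulation of the same loop shape; the dictionaries below are illustrative stand-ins, and trainer.add_callback is replaced by a plain list append:

# Each integration module exposes a flat {event_name: function} dict.
tb_callbacks = {'on_train_start': lambda trainer: print('tb: train start'),
                'on_val_end': lambda trainer: print('tb: val end')}
clearml_callbacks = {}  # empty when clearml is unavailable (see the clearml hunk below)

registered = []
for x in tb_callbacks, clearml_callbacks:  # same iteration as add_integration_callbacks
    for k, v in x.items():
        registered.append((k, v))          # stands in for trainer.add_callback(k, v)

print([k for k, _ in registered])          # ['on_train_start', 'on_val_end']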

@@ -9,47 +9,33 @@ except (ImportError, AssertionError):
    clearml = None


-def _log_scalers(metric_dict, group="", step=0):
-    task = Task.current_task()
-    if task:
-        for k, v in metric_dict.items():
-            task.get_logger().report_scalar(group, k, v, step)
-
-
-def before_train(trainer):
+def on_train_start(trainer):
    # TODO: reuse existing task
-    task = Task.init(project_name=trainer.args.project if trainer.args.project != 'runs/train' else 'YOLOv5',
-                     task_name=trainer.args.name if trainer.args.name != 'exp' else 'Training',
-                     tags=['YOLOv5'],
+    task = Task.init(project_name=trainer.args.project if trainer.args.project != 'runs/train' else 'YOLOv8',
+                     task_name=trainer.args.name,
+                     tags=['YOLOv8'],
                      output_uri=True,
                      reuse_last_task_id=False,
                      auto_connect_frameworks={'pytorch': False})
    task.connect(dict(trainer.args), name='General')


-def on_batch_end(trainer):
-    _log_scalers(trainer.label_loss_items(trainer.tloss, prefix="train"), "train", trainer.epoch)
-
-
def on_val_end(trainer):
-    _log_scalers(trainer.label_loss_items(trainer.validator.loss, prefix="val"), "val", trainer.epoch)
-    _log_scalers({k: v for k, v in trainer.metrics.items() if k.startswith("metrics")}, "metrics", trainer.epoch)
    if trainer.epoch == 0:
        model_info = {
-            "inference_speed": trainer.validator.speed[1],
-            "flops@640": get_flops(trainer.model),
-            "params": get_num_params(trainer.model)}
-        Task.current_task().connect(model_info, 'Model')
+            "Inference speed (ms/img)": round(trainer.validator.speed[1], 1),
+            "GFLOPs": round(get_flops(trainer.model), 1),
+            "Parameters": get_num_params(trainer.model)}
+        Task.current_task().connect(model_info, name='Model')


def on_train_end(trainer):
-    task = Task.current_task()
-    if task:
-        task.update_output_model(model_path=str(trainer.best), model_name='Best Model', auto_delete_file=False)
+    Task.current_task().update_output_model(model_path=str(trainer.best),
+                                            model_name=trainer.args.name,
+                                            auto_delete_file=False)


callbacks = {
-    "before_train": before_train,
+    "on_train_start": on_train_start,
    "on_val_end": on_val_end,
-    "on_batch_end": on_batch_end,
-    "on_train_end": on_train_end}
+    "on_train_end": on_train_end} if clearml else {}

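The clearml hunk begins at line 9, just below an import guard that is not shown in the diff; a plausible sketch of that guard, assuming the usual optional-dependency pattern implied by the except (ImportError, AssertionError) context:

# Assumed shape of the guard above the hunk; the real lines 1-8 are not part of this diff.
try:
    import clearml
    from clearml import Task

    assert hasattr(clearml, '__version__')  # reject a bare directory that merely shadows the package
except (ImportError, AssertionError):
    clearml = None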
@@ -0,0 +1,26 @@
+from torch.utils.tensorboard import SummaryWriter
+
+writer = None  # TensorBoard SummaryWriter instance
+
+
+def _log_scalars(scalars, step=0):
+    for k, v in scalars.items():
+        writer.add_scalar(k, v, step)
+
+
+def on_train_start(trainer):
+    global writer
+    writer = SummaryWriter(str(trainer.save_dir))
+    trainer.console.info(f"Logging results to {trainer.save_dir}\n"
+                         f"Starting training for {trainer.args.epochs} epochs...")
+
+
+def on_batch_end(trainer):
+    _log_scalars(trainer.label_loss_items(trainer.tloss, prefix="train"), trainer.epoch)
+
+
+def on_val_end(trainer):
+    _log_scalars(trainer.metrics, trainer.epoch)
+
+
+callbacks = {"on_train_start": on_train_start, "on_val_end": on_val_end, "on_batch_end": on_batch_end}
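With the SummaryWriter pointed at trainer.save_dir, the train losses and validation metrics logged above can be viewed with the standard TensorBoard CLI (tensorboard --logdir runs/train) or read back programmatically. A small sketch using TensorBoard's event reader, with the run directory and scalar tag assumed rather than taken from this commit:

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator('runs/train/exp')     # assumed save_dir of the first run
acc.Reload()                                 # parse the event files on disk
print(acc.Tags()['scalars'])                 # the keys produced by label_loss_items and trainer.metrics
for event in acc.Scalars('train/box_loss'):  # tag name assumed; pick one printed above
    print(event.step, event.value)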

@@ -15,7 +15,7 @@ nosave: False
cache: False  # True/ram, disk or False
device: ''  # cuda device, i.e. 0 or 0,1,2,3 or cpu
workers: 8
-project: 'runs'
+project: 'runs/train'
name: 'exp'
exist_ok: False
pretrained: False
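The new default nests training outputs under runs/train, which matches the updated CI paths above (runs/train/exp/weights/last.pt, runs/train/exp2, ...). A tiny illustration of how project and name compose into the save directory; the exp2/exp3 incrementing is handled elsewhere in the trainer and is only noted here:

from pathlib import Path

project, name = 'runs/train', 'exp'      # new defaults from this hunk
save_dir = Path(project) / name          # -> runs/train/exp (exp2, exp3, ... on later runs)
print(save_dir / 'weights' / 'last.pt')  # runs/train/exp/weights/last.pt, as used in the CI tests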
