Add DVC experiments logger with DVCLive (#2792)

2023-06-05 05:10:43 -10:00
parent 2b26572e42
commit 6057b267af
3 changed files with 138 additions and 1 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -14,6 +14,7 @@ tqdm>=4.64.0

 # Logging -------------------------------------
 # tensorboard>=2.13.0
+# dvclive>=2.11.0
 # clearml
 # comet

--- a/ultralytics/yolo/utils/callbacks/base.py
+++ b/ultralytics/yolo/utils/callbacks/base.py
@ -198,6 +198,7 @@ def add_integration_callbacks(instance):
    """
    from .clearml import callbacks as clearml_cb
    from .comet import callbacks as comet_cb
+    from .dvc import callbacks as dvc_cb
    from .hub import callbacks as hub_cb
    from .mlflow import callbacks as mlflow_cb
    from .neptune import callbacks as neptune_cb
@ -205,7 +206,7 @@ def add_integration_callbacks(instance):
    from .tensorboard import callbacks as tensorboard_cb
    from .wb import callbacks as wb_cb

-    for x in clearml_cb, comet_cb, hub_cb, mlflow_cb, neptune_cb, tune_cb, tensorboard_cb, wb_cb:
+    for x in clearml_cb, comet_cb, hub_cb, mlflow_cb, neptune_cb, tune_cb, tensorboard_cb, wb_cb, dvc_cb:
        for k, v in x.items():
            if v not in instance.callbacks[k]:  # prevent duplicate callbacks addition
                instance.callbacks[k].append(v)  # callback[name].append(func)
--- a/ultralytics/yolo/utils/callbacks/dvc.py
+++ b/ultralytics/yolo/utils/callbacks/dvc.py
@ -0,0 +1,135 @@
+# Ultralytics YOLO 🚀, GPL-3.0 license
+import os
+
+from ultralytics.yolo.utils import LOGGER, TESTS_RUNNING
+from ultralytics.yolo.utils.torch_utils import get_flops, get_num_params
+
+try:
+    from importlib.metadata import version
+
+    import dvclive
+
+    assert not TESTS_RUNNING  # do not log pytest
+    assert version('dvclive')
+except (ImportError, AssertionError):
+    dvclive = None
+
+# DVCLive logger instance
+live = None
+_processed_plots = {}
+
+# `on_fit_epoch_end` is also called on the final validation (this probably needs to be fixed);
+# for now, this flag is how we distinguish the final evaluation of the best model from
+# the validation of the last epoch.
+_training_epoch = False
+
+
+def _logger_disabled():
+    return os.getenv('ULTRALYTICS_DVC_DISABLED', 'false').lower() == 'true'
+
+
+def _log_images(image_path, prefix=''):
+    if live:
+        live.log_image(os.path.join(prefix, image_path.name), image_path)
+
+
+def _log_plots(plots, prefix=''):
+    for name, params in plots.items():
+        timestamp = params['timestamp']
+        if _processed_plots.get(name, None) != timestamp:
+            _log_images(name, prefix)
+            _processed_plots[name] = timestamp
+
+
+def _log_confusion_matrix(validator):
+    targets = []
+    preds = []
+    matrix = validator.confusion_matrix.matrix
+    names = list(validator.names.values())
+    if validator.confusion_matrix.task == 'detect':
+        names += ['background']
+
+    for ti, pred in enumerate(matrix.T.astype(int)):
+        for pi, num in enumerate(pred):
+            targets.extend([names[ti]] * num)
+            preds.extend([names[pi]] * num)
+
+    live.log_sklearn_plot('confusion_matrix', targets, preds, name='cf.json', normalized=True)
+
+
+def on_pretrain_routine_start(trainer):
+    try:
+        global live
+        if not _logger_disabled():
+            live = dvclive.Live(save_dvc_exp=True)
+            LOGGER.info(
+                'DVCLive is detected and auto logging is enabled (can be disabled with `ULTRALYTICS_DVC_DISABLED=true`).'
+            )
+        else:
+            LOGGER.debug('DVCLive is detected and auto logging is disabled via `ULTRALYTICS_DVC_DISABLED`.')
+            live = None
+    except Exception as e:
+        LOGGER.warning(f'WARNING ⚠️ DVCLive installed but not initialized correctly, not logging this run. {e}')
+
+
+def on_pretrain_routine_end(trainer):
+    _log_plots(trainer.plots, 'train')
+
+
+def on_train_start(trainer):
+    if live:
+        live.log_params(trainer.args)
+
+
+def on_train_epoch_start(trainer):
+    global _training_epoch
+    _training_epoch = True
+
+
+def on_fit_epoch_end(trainer):
+    global _training_epoch
+    if live and _training_epoch:
+        all_metrics = {**trainer.label_loss_items(trainer.tloss, prefix='train'), **trainer.metrics, **trainer.lr}
+        for metric, value in all_metrics.items():
+            live.log_metric(metric, value)
+
+        if trainer.epoch == 0:
+            model_info = {
+                'model/parameters': get_num_params(trainer.model),
+                'model/GFLOPs': round(get_flops(trainer.model), 3),
+                'model/speed(ms)': round(trainer.validator.speed['inference'], 3)}
+
+            for metric, value in model_info.items():
+                live.log_metric(metric, value, plot=False)
+
+        _log_plots(trainer.plots, 'train')
+        _log_plots(trainer.validator.plots, 'val')
+
+        live.next_step()
+        _training_epoch = False
+
+
+def on_train_end(trainer):
+    if live:
+        # At the end, log the best metrics. It runs the validator on the best model internally.
+        all_metrics = {**trainer.label_loss_items(trainer.tloss, prefix='train'), **trainer.metrics, **trainer.lr}
+        for metric, value in all_metrics.items():
+            live.log_metric(metric, value, plot=False)
+
+        _log_plots(trainer.plots, 'eval')
+        _log_plots(trainer.validator.plots, 'eval')
+        _log_confusion_matrix(trainer.validator)
+
+        if trainer.best.exists():
+            live.log_artifact(trainer.best, copy=True)
+
+        live.end()
+
+
+callbacks = {
+    'on_pretrain_routine_start': on_pretrain_routine_start,
+    'on_pretrain_routine_end': on_pretrain_routine_end,
+    'on_train_start': on_train_start,
+    'on_train_epoch_start': on_train_epoch_start,
+    'on_fit_epoch_end': on_fit_epoch_end,
+    'on_train_end': on_train_end} if dvclive else {}