Integration of v8 segmentation (#107)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
2022-12-28 23:01:38 +08:00
parent 384f0ef1c6
commit 8406b49b49
16 changed files with 422 additions and 224 deletions
--- a/.gitignore
+++ b/.gitignore
@ -134,3 +134,17 @@ runs/
 wandb/
 .DS_Store
 # Neural Network weights -----------------------------------------------------------------------------------------------
 *.weights
 *.pt
 *.pb
 *.onnx
 *.engine
 *.mlmodel
 *.torchscript
 *.tflite
 *.h5
 *_saved_model/
 *_web_model/
 *_openvino_model/
--- a/tests/check_flops.py
+++ b/tests/check_flops.py
@ -0,0 +1,67 @@
 import torch
 from ultralytics import YOLO
 from ultralytics.nn.modules import Detect, Segment
 def export_onnx(model, file):
    """Export ``model`` to an ONNX file and embed stride/names metadata in it.

    Args:
        model: torch module to export. It is called with ``profile=True`` and must expose
            ``stride`` and ``names`` attributes, which are written into the ONNX metadata.
        file: destination path of the ONNX file; it is written, reloaded for checking and saved again.
    """
    # YOLOv8 ONNX export
    import onnx
    im = torch.zeros(1, 3, 640, 640)  # dummy BCHW input used for the dry run and for tracing
    model.eval()
    model(im, profile=True)  # dry run before switching the heads to export mode
    for m in model.modules():
        if isinstance(m, (Detect, Segment)):
            m.export = True  # heads check this flag to choose their return format
    torch.onnx.export(
        model,
        im,
        file,
        verbose=False,
        opset_version=12,
        do_constant_folding=True,  # WARNING: DNN inference with torch>=1.12 may require do_constant_folding=False
        input_names=['images'])
    # Checks
    model_onnx = onnx.load(file)  # load onnx model
    onnx.checker.check_model(model_onnx)  # check onnx model
    # Metadata
    d = {'stride': int(max(model.stride)), 'names': model.names}
    for k, v in d.items():
        meta = model_onnx.metadata_props.add()
        meta.key, meta.value = k, str(v)  # metadata values are stored as strings
    onnx.save(model_onnx, file)
 if __name__ == "__main__":
    # Build every YOLOv8 detection config and its "-seg" counterpart in turn (n, s, m, l, x),
    # printing the config name before each build.
    model = YOLO()
    for size in "nsmlx":
        for suffix in ("", "-seg"):
            name = f"yolov8{size}{suffix}"
            print(name)
            model.new(f"{name}.yaml")
            # Optional: uncomment to also export the freshly built model to ONNX.
            # export_onnx(model.model, f"{name}.onnx")
    # Recorded results (detect vs segment); the value in parentheses is for the ch[0] // 4 variant.
    # n vs n-seg: 8.9GFLOPs vs 12.8GFLOPs, 3.16M vs 3.6M. ch[0] // 4 (11.9GFLOPs, 3.39M)
    # s vs s-seg: 28.8GFLOPs vs 44.4GFLOPs, 11.1M vs 12.9M. ch[0] // 4 (39.5GFLOPs, 11.7M)
    # m vs m-seg: 79.3GFLOPs vs 113.8GFLOPs, 25.9M vs 29.5M. ch[0] // 4 (103.GFLOPs, 27.1M)
    # l vs l-seg: 165.7GFLOPs vs 226.3GFLOPs, 43.7M vs 49.6M. ch[0] // 4 (207GFLOPs, 45.7M)
    # x vs x-seg: 258.5GFLOPs vs 353.0GFLOPs, 68.3M vs 77.5M. ch[0] // 4 (324GFLOPs, 71.4M)
--- a/ultralytics/nn/modules.py
+++ b/ultralytics/nn/modules.py
@ -576,11 +576,11 @@ class Detections:
 class Proto(nn.Module):
-    # YOLOv5 mask Proto module for segmentation models
+    # YOLOv8 mask Proto module for segmentation models
    def __init__(self, c1, c_=256, c2=32):  # ch_in, number of protos, number of masks
        super().__init__()
        self.cv1 = Conv(c1, c_, k=3)
-        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
+        self.upsample = nn.ConvTranspose2d(c_, c_, 2, 2, 0, bias=True)  # nn.Upsample(scale_factor=2, mode='nearest')
        self.cv2 = Conv(c_, c_, k=3)
        self.cv3 = Conv(c_, c2)
@ -628,16 +628,16 @@ class Detect(nn.Module):
        shape = x[0].shape  # BCHW
        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
        box, cls = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2).split((self.reg_max * 4, self.nc), 1)
        if self.training:
-            return x, box, cls
+            return x
        elif self.dynamic or self.shape != shape:
            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
            self.shape = shape
        box, cls = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2).split((self.reg_max * 4, self.nc), 1)
        dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
        y = torch.cat((dbox, cls.sigmoid()), 1)
-        return y if self.export else (y, (x, box, cls))
+        return y if self.export else (y, x)
    def bias_init(self):
        # Initialize Detect() biases, WARNING: requires stride availability
@ -651,19 +651,27 @@ class Detect(nn.Module):
 class Segment(Detect):
    # YOLOv8 Segment head for segmentation models
-    def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=()):
+    def __init__(self, nc=80, nm=32, npr=256, ch=()):
-        super().__init__(nc, anchors, ch)
+        super().__init__(nc, ch)
        self.nm = nm  # number of masks
        self.npr = npr  # number of protos
        self.no = 5 + nc + self.nm  # number of outputs per anchor
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
        self.proto = Proto(ch[0], self.npr, self.nm)  # protos
        self.detect = Detect.forward
        c4 = max(ch[0] // 4, self.nm)
        self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nm, 1)) for x in ch)
    def forward(self, x):
        p = self.proto(x[0])
        mc = []  # mask coefficient
        for i in range(self.nl):
            mc.append(self.cv4[i](x[i]))
        mc = torch.cat([mi.view(p.shape[0], self.nm, -1) for mi in mc], 2)
        x = self.detect(self, x)
-        return (x, p) if self.training else (x[0], p) if self.export else (x[0], p, x[1])
+        if self.training:
            return x, mc, p
        return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p))
 class Classify(nn.Module):
--- a/ultralytics/nn/tasks.py
+++ b/ultralytics/nn/tasks.py
@ -101,7 +101,7 @@ class DetectionModel(BaseModel):
        if isinstance(m, (Detect, Segment)):
            s = 256  # 2x min stride
            m.inplace = self.inplace
-            forward = lambda x: self.forward(x)[0] if isinstance(m, (Segment, Detect)) else self.forward(x)
+            forward = lambda x: self.forward(x)[0] if isinstance(m, Segment) else self.forward(x)
            m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s))])  # forward
            self.stride = m.stride
            m.bias_init()  # only run once
@ -163,8 +163,8 @@ class DetectionModel(BaseModel):
 class SegmentationModel(DetectionModel):
    # YOLOv5 segmentation model
-    def __init__(self, cfg='yolov5s-seg.yaml', ch=3, nc=None):
+    def __init__(self, cfg='yolov5s-seg.yaml', ch=3, nc=None, verbose=True):
-        super().__init__(cfg, ch, nc)
+        super().__init__(cfg, ch, nc, verbose)
 class ClassificationModel(BaseModel):
@ -300,7 +300,7 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
        elif m in {Detect, Segment}:
            args.append([ch[x] for x in f])
            if m is Segment:
-                args[3] = make_divisible(args[3] * gw, 8)
+                args[2] = make_divisible(args[2] * gw, 8)
        else:
            c2 = ch[f]
--- a/ultralytics/yolo/init.py
+++ b/ultralytics/yolo/init.py
@ -0,0 +1 @@
 from . import v8
--- a/ultralytics/yolo/utils/configs/default.yaml
+++ b/ultralytics/yolo/utils/configs/default.yaml
@ -3,7 +3,7 @@
 # Task and Mode
 task: "classify"  # choices=['detect', 'segment', 'classify', 'init'] # init is a special case
-mode: "train"  # choice=['train', 'val', 'infer']
+mode: "train"  # choice=['train', 'val', 'predict']
 # Train settings -------------------------------------------------------------------------------------------------------
 model: null  # i.e. yolov5s.pt, yolo.yaml
--- a/ultralytics/yolo/utils/tal.py
+++ b/ultralytics/yolo/utils/tal.py
@ -86,7 +86,8 @@ class TaskAlignedAssigner(nn.Module):
        if self.n_max_boxes == 0:
            device = gt_bboxes.device
            return (torch.full_like(pd_scores[..., 0], self.bg_idx).to(device), torch.zeros_like(pd_bboxes).to(device),
-                    torch.zeros_like(pd_scores).to(device), torch.zeros_like(pd_scores[..., 0]).to(device))
+                    torch.zeros_like(pd_scores).to(device), torch.zeros_like(pd_scores[..., 0]).to(device),
                    torch.zeros_like(pd_scores[..., 0]).to(device))
        mask_pos, align_metric, overlaps = self.get_pos_mask(pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points,
                                                             mask_gt)
@ -103,7 +104,7 @@ class TaskAlignedAssigner(nn.Module):
        norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).amax(-2).unsqueeze(-1)
        target_scores = target_scores * norm_align_metric
-        return target_labels, target_bboxes, target_scores, fg_mask.bool()
+        return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_gt_idx
    def get_pos_mask(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt):
        # get anchor_align metric, (b, max_num_obj, h*w)
@ -146,9 +147,6 @@ class TaskAlignedAssigner(nn.Module):
        # (b, max_num_obj, topk, h*w) -> (b, max_num_obj, h*w)
        is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(-2)
        # filter invalid bboxes
        # assigned topk should be unique, this is for dealing with empty labels
        # since empty labels will generate index `0` through `F.one_hot`
        # NOTE: but what if the topk_idxs include `0`?
        is_in_topk = torch.where(is_in_topk > 1, 0, is_in_topk)
        return is_in_topk.to(metrics.dtype)
--- a/ultralytics/yolo/v8/detect/train.py
+++ b/ultralytics/yolo/v8/detect/train.py
@ -9,11 +9,10 @@ from ultralytics.yolo.data.dataloaders.v5loader import create_dataloader
 from ultralytics.yolo.engine.trainer import DEFAULT_CONFIG, BaseTrainer
 from ultralytics.yolo.utils import colorstr
 from ultralytics.yolo.utils.loss import BboxLoss
 from ultralytics.yolo.utils.metrics import smooth_BCE
 from ultralytics.yolo.utils.ops import xywh2xyxy
 from ultralytics.yolo.utils.plotting import plot_images, plot_results
 from ultralytics.yolo.utils.tal import TaskAlignedAssigner, dist2bbox, make_anchors
-from ultralytics.yolo.utils.torch_utils import de_parallel, strip_optimizer
+from ultralytics.yolo.utils.torch_utils import de_parallel
 # BaseTrainer python usage
@ -78,7 +77,8 @@ class DetectionTrainer(BaseTrainer):
        return dict(zip(keys, loss_items)) if loss_items is not None else keys
    def progress_string(self):
-        return ('\n' + '%11s' * 7) % ('Epoch', 'GPU_mem', *self.loss_names, 'Instances', 'Size')
+        return ('\n' + '%11s' *
                (4 + len(self.loss_names))) % ('Epoch', 'GPU_mem', *self.loss_names, 'Instances', 'Size')
    def plot_training_samples(self, batch, ni):
        plot_images(images=batch["img"],
@ -100,15 +100,13 @@ class Loss:
        device = next(model.parameters()).device  # get model device
        h = model.args  # hyperparameters
        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
        self.cp, self.cn = smooth_BCE(eps=h.get("label_smoothing", 0.0))  # positive, negative BCE targets
        m = model.model[-1]  # Detect() module
        self.bce = nn.BCEWithLogitsLoss(reduction='none')
        self.hyp = h
        self.stride = m.stride  # model strides
        self.nc = m.nc  # number of classes
-        self.nl = m.nl  # number of layers
+        self.no = m.no
        self.reg_max = m.reg_max
        self.device = device
        self.use_dfl = m.reg_max > 1
@ -141,12 +139,15 @@ class Loss:
    def __call__(self, preds, batch):
        loss = torch.zeros(3, device=self.device)  # box, cls, dfl
-        feats, pred_distri, pred_scores = preds if len(preds) == 3 else preds[1]
+        feats = preds[1] if isinstance(preds, tuple) else preds
        pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
            (self.reg_max * 4, self.nc), 1)
        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
        pred_distri = pred_distri.permute(0, 2, 1).contiguous()
        dtype = pred_scores.dtype
-        batch_size, grid_size = pred_scores.shape[:2]
+        batch_size = pred_scores.shape[0]
        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # image size (h,w)
        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
@ -159,7 +160,7 @@ class Loss:
        # pboxes
        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)
-        target_labels, target_bboxes, target_scores, fg_mask = self.assigner(
+        _, target_bboxes, target_scores, fg_mask, _ = self.assigner(
            pred_scores.detach().sigmoid(), (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
            anchor_points * stride_tensor, gt_labels, gt_bboxes, mask_gt)
--- a/ultralytics/yolo/v8/models/seg/yolov8l-seg.yaml
+++ b/ultralytics/yolo/v8/models/seg/yolov8l-seg.yaml
@ -0,0 +1,42 @@
 # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
 # Parameters
 nc: 80  # number of classes
 depth_multiple: 1.00  # model depth multiple
 width_multiple: 1.00  # layer channel multiple
 # YOLOv8.0l backbone
 backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 3, 2]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C2f, [128, True]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 6, C2f, [256, True]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 6, C2f, [512, True]],
   [-1, 1, Conv, [512, 3, 2]],  # 7-P5/32
   [-1, 3, C2f, [512, True]],
   [-1, 1, SPPF, [512, 5]],  # 9
  ]
 # YOLOv8.0l head
 head:
  [[-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C2f, [512]],  # 12
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C2f, [256]],  # 15 (P3/8-small)
   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 12], 1, Concat, [1]],  # cat head P4
   [-1, 3, C2f, [512]],  # 18 (P4/16-medium)
   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 9], 1, Concat, [1]],  # cat head P5
   [-1, 3, C2f, [512]],  # 21 (P5/32-large)
   [[15, 18, 21], 1, Segment, [nc, 32, 256]],  # Segment(P3, P4, P5)
  ]
--- a/ultralytics/yolo/v8/models/seg/yolov8m-seg.yaml
+++ b/ultralytics/yolo/v8/models/seg/yolov8m-seg.yaml
@ -0,0 +1,42 @@
 # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
 # Parameters
 nc: 80  # number of classes
 depth_multiple: 0.67  # model depth multiple
 width_multiple: 0.75  # layer channel multiple
 # YOLOv8.0m backbone
 backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 3, 2]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C2f, [128, True]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 6, C2f, [256, True]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 6, C2f, [512, True]],
   [-1, 1, Conv, [768, 3, 2]],  # 7-P5/32
   [-1, 3, C2f, [768, True]],
   [-1, 1, SPPF, [768, 5]],  # 9
  ]
 # YOLOv8.0m head
 head:
  [[-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C2f, [512]],  # 12
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C2f, [256]],  # 15 (P3/8-small)
   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 12], 1, Concat, [1]],  # cat head P4
   [-1, 3, C2f, [512]],  # 18 (P4/16-medium)
   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 9], 1, Concat, [1]],  # cat head P5
   [-1, 3, C2f, [768]],  # 21 (P5/32-large)
   [[15, 18, 21], 1, Segment, [nc, 32, 256]],  # Segment(P3, P4, P5)
  ]
--- a/ultralytics/yolo/v8/models/seg/yolov8n-seg.yaml
+++ b/ultralytics/yolo/v8/models/seg/yolov8n-seg.yaml
@ -4,9 +4,8 @@
 nc: 80  # number of classes
 depth_multiple: 0.33  # model depth multiple
 width_multiple: 0.25  # layer channel multiple
 anchors: [[16,19], [55,65], [178,192]]
-# YOLOv8n v0.0 backbone
+# YOLOv8.0n backbone
 backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 3, 2]],  # 0-P1/2
@ -21,7 +20,7 @@ backbone:
   [-1, 1, SPPF, [1024, 5]],  # 9
  ]
-# YOLOv8n v0.0 head
+# YOLOv8.0n head
 head:
  [[-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
--- a/ultralytics/yolo/v8/models/seg/yolov8s-seg.yaml
+++ b/ultralytics/yolo/v8/models/seg/yolov8s-seg.yaml
@ -0,0 +1,42 @@
 # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
 # Parameters
 nc: 80  # number of classes
 depth_multiple: 0.33  # model depth multiple
 width_multiple: 0.50  # layer channel multiple
 # YOLOv8.0s backbone
 backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 3, 2]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C2f, [128, True]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 6, C2f, [256, True]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 6, C2f, [512, True]],
   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
   [-1, 3, C2f, [1024, True]],
   [-1, 1, SPPF, [1024, 5]],  # 9
  ]
 # YOLOv8.0s head
 head:
  [[-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C2f, [512]],  # 12
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C2f, [256]],  # 15 (P3/8-small)
   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 12], 1, Concat, [1]],  # cat head P4
   [-1, 3, C2f, [512]],  # 18 (P4/16-medium)
   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 9], 1, Concat, [1]],  # cat head P5
   [-1, 3, C2f, [1024]],  # 21 (P5/32-large)
   [[15, 18, 21], 1, Segment, [nc, 32, 256]],  # Segment(P3, P4, P5)
  ]
--- a/ultralytics/yolo/v8/models/seg/yolov8x-seg.yaml
+++ b/ultralytics/yolo/v8/models/seg/yolov8x-seg.yaml
@ -0,0 +1,42 @@
 # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
 # Parameters
 nc: 80  # number of classes
 depth_multiple: 1.00  # model depth multiple
 width_multiple: 1.25  # layer channel multiple
 # YOLOv8.0x backbone
 backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 3, 2]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C2f, [128, True]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 6, C2f, [256, True]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 6, C2f, [512, True]],
   [-1, 1, Conv, [512, 3, 2]],  # 7-P5/32
   [-1, 3, C2f, [512, True]],
   [-1, 1, SPPF, [512, 5]],  # 9
  ]
 # YOLOv8.0x head
 head:
  [[-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C2f, [512]],  # 12
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C2f, [256]],  # 15 (P3/8-small)
   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 12], 1, Concat, [1]],  # cat head P4
   [-1, 3, C2f, [512]],  # 18 (P4/16-medium)
   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 9], 1, Concat, [1]],  # cat head P5
   [-1, 3, C2f, [512]],  # 21 (P5/32-large)
   [[15, 18, 21], 1, Segment, [nc, 32, 256]],  # Segment(P3, P4, P5)
  ]
--- a/ultralytics/yolo/v8/segment/predict.py
+++ b/ultralytics/yolo/v8/segment/predict.py
@ -12,17 +12,14 @@ class SegmentationPredictor(DetectionPredictor):
    def postprocess(self, preds, img, orig_img):
        masks = []
        if len(preds) == 2:  # eval
            p, proto, = preds
        else:  # len(3) train
            p, proto, _ = preds
        # TODO: filter by classes
-        p = ops.non_max_suppression(p,
+        p = ops.non_max_suppression(preds[0],
                                    self.args.conf_thres,
                                    self.args.iou_thres,
                                    agnostic=self.args.agnostic_nms,
                                    max_det=self.args.max_det,
                                    nm=32)
        proto = preds[1][-1]
        for i, pred in enumerate(p):
            shape = orig_img[i].shape if self.webcam else orig_img.shape
            if not len(pred):
--- a/ultralytics/yolo/v8/segment/train.py
+++ b/ultralytics/yolo/v8/segment/train.py
@ -6,9 +6,10 @@ import torch.nn.functional as F
 from ultralytics.nn.tasks import SegmentationModel
 from ultralytics.yolo import v8
 from ultralytics.yolo.engine.trainer import DEFAULT_CONFIG
-from ultralytics.yolo.utils.metrics import FocalLoss, bbox_iou, smooth_BCE
+from ultralytics.yolo.utils.loss import BboxLoss
-from ultralytics.yolo.utils.ops import crop_mask, xywh2xyxy
+from ultralytics.yolo.utils.ops import crop_mask, xywh2xyxy, xyxy2xywh
 from ultralytics.yolo.utils.plotting import plot_images, plot_results
 from ultralytics.yolo.utils.tal import TaskAlignedAssigner, dist2bbox, make_anchors
 from ultralytics.yolo.utils.torch_utils import de_parallel
 from ..detect import DetectionTrainer
@ -31,188 +32,9 @@ class SegmentationTrainer(DetectionTrainer):
                                                args=self.args)
    def criterion(self, preds, batch):
-        head = de_parallel(self.model).model[-1]
+        if not hasattr(self, 'compute_loss'):
-        sort_obj_iou = False
+            self.compute_loss = SegLoss(de_parallel(self.model), overlap=self.args.overlap_mask)
-        autobalance = False
+        return self.compute_loss(preds, batch)
        # init losses
        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([self.args.cls_pw], device=self.device))
        BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([self.args.obj_pw], device=self.device))
        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
        cp, cn = smooth_BCE(eps=self.args.label_smoothing)  # positive, negative BCE targets
        # Focal loss
        g = self.args.fl_gamma
        if self.args.fl_gamma > 0:
            BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g)
        balance = {3: [4.0, 1.0, 0.4]}.get(head.nl, [4.0, 1.0, 0.25, 0.06, 0.02])  # P3-P7
        ssi = list(head.stride).index(16) if autobalance else 0  # stride 16 index
        BCEcls, BCEobj, gr, autobalance = BCEcls, BCEobj, 1.0, autobalance
        def single_mask_loss(gt_mask, pred, proto, xyxy, area):
            # Mask loss for one image
            pred_mask = (pred @ proto.view(head.nm, -1)).view(-1, *proto.shape[1:])  # (n,32) @ (32,80,80) -> (n,80,80)
            loss = F.binary_cross_entropy_with_logits(pred_mask, gt_mask, reduction="none")
            return (crop_mask(loss, xyxy).mean(dim=(1, 2)) / area).mean()
        def build_targets(p, targets):
            # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
            nonlocal head
            na, nt = head.na, targets.shape[0]  # number of anchors, targets
            tcls, tbox, indices, anch, tidxs, xywhn = [], [], [], [], [], []
            gain = torch.ones(8, device=self.device)  # normalized to gridspace gain
            ai = torch.arange(na, device=self.device).float().view(na, 1).repeat(1,
                                                                                 nt)  # same as .repeat_interleave(nt)
            if self.args.overlap_mask:
                batch = p[0].shape[0]
                ti = []
                for i in range(batch):
                    num = (targets[:, 0] == i).sum()  # find number of targets of each image
                    ti.append(torch.arange(num, device=self.device).float().view(1, num).repeat(na, 1) + 1)  # (na, num)
                ti = torch.cat(ti, 1)  # (na, nt)
            else:
                ti = torch.arange(nt, device=self.device).float().view(1, nt).repeat(na, 1)
            targets = torch.cat((targets.repeat(na, 1, 1), ai[..., None], ti[..., None]), 2)  # append anchor indices
            g = 0.5  # bias
            off = torch.tensor(
                [
                    [0, 0],
                    [1, 0],
                    [0, 1],
                    [-1, 0],
                    [0, -1],  # j,k,l,m
                    # [1, 1], [1, -1], [-1, 1], [-1, -1],  # jk,jm,lk,lm
                ],
                device=self.device).float() * g  # offsets
            for i in range(head.nl):
                anchors, shape = head.anchors[i], p[i].shape
                gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]]  # xyxy gain
                # Match targets to anchors
                t = targets * gain  # shape(3,n,7)
                if nt:
                    # Matches
                    r = t[..., 4:6] / anchors[:, None]  # wh ratio
                    j = torch.max(r, 1 / r).max(2)[0] < self.args.anchor_t  # compare
                    # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t']  # iou(3,n)=wh_iou(anchors(3,2), gwh(n,2))
                    t = t[j]  # filter
                    # Offsets
                    gxy = t[:, 2:4]  # grid xy
                    gxi = gain[[2, 3]] - gxy  # inverse
                    j, k = ((gxy % 1 < g) & (gxy > 1)).T
                    l, m = ((gxi % 1 < g) & (gxi > 1)).T
                    j = torch.stack((torch.ones_like(j), j, k, l, m))
                    t = t.repeat((5, 1, 1))[j]
                    offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
                else:
                    t = targets[0]
                    offsets = 0
                # Define
                bc, gxy, gwh, at = t.chunk(4, 1)  # (image, class), grid xy, grid wh, anchors
                (a, tidx), (b, c) = at.long().T, bc.long().T  # anchors, image, class
                gij = (gxy - offsets).long()
                gi, gj = gij.T  # grid indices
                # Append
                indices.append((b, a, gj.clamp_(0, shape[2] - 1), gi.clamp_(0, shape[3] - 1)))  # image, anchor, grid
                tbox.append(torch.cat((gxy - gij, gwh), 1))  # box
                anch.append(anchors[a])  # anchors
                tcls.append(c)  # class
                tidxs.append(tidx)
                xywhn.append(torch.cat((gxy, gwh), 1) / gain[2:6])  # xywh normalized
            return tcls, tbox, indices, anch, tidxs, xywhn
        if len(preds) == 2:  # eval
            p, proto, = preds
        else:  # len(3) train
            _, proto, p = preds
        targets = torch.cat((batch["batch_idx"].view(-1, 1), batch["cls"].view(-1, 1), batch["bboxes"]), 1)
        masks = batch["masks"]
        targets, masks = targets.to(self.device), masks.to(self.device).float()
        bs, nm, mask_h, mask_w = proto.shape  # batch size, number of masks, mask height, mask width
        lcls = torch.zeros(1, device=self.device)
        lbox = torch.zeros(1, device=self.device)
        lobj = torch.zeros(1, device=self.device)
        lseg = torch.zeros(1, device=self.device)
        tcls, tbox, indices, anchors, tidxs, xywhn = build_targets(p, targets)
        # Losses
        for i, pi in enumerate(p):  # layer index, layer predictions
            b, a, gj, gi = indices[i]  # image, anchor, gridy, gridx
            tobj = torch.zeros(pi.shape[:4], dtype=pi.dtype, device=self.device)  # target obj
            n = b.shape[0]  # number of targets
            if n:
                pxy, pwh, _, pcls, pmask = pi[b, a, gj, gi].split((2, 2, 1, head.nc, nm), 1)  # subset of predictions
                # Box regression
                pxy = pxy.sigmoid() * 2 - 0.5
                pwh = (pwh.sigmoid() * 2) ** 2 * anchors[i]
                pbox = torch.cat((pxy, pwh), 1)  # predicted box
                iou = bbox_iou(pbox, tbox[i], CIoU=True).squeeze()  # iou(prediction, target)
                lbox += (1.0 - iou).mean()  # iou loss
                # Objectness
                iou = iou.detach().clamp(0).type(tobj.dtype)
                if sort_obj_iou:
                    j = iou.argsort()
                    b, a, gj, gi, iou = b[j], a[j], gj[j], gi[j], iou[j]
                if gr < 1:
                    iou = (1.0 - gr) + gr * iou
                tobj[b, a, gj, gi] = iou  # iou ratio
                # Classification
                if head.nc > 1:  # cls loss (only if multiple classes)
                    t = torch.full_like(pcls, cn, device=self.device)  # targets
                    t[range(n), tcls[i]] = cp
                    lcls += BCEcls(pcls, t)  # BCE
                # Mask regression
                if tuple(masks.shape[-2:]) != (mask_h, mask_w):  # downsample
                    masks = F.interpolate(masks[None], (mask_h, mask_w), mode="nearest")[0]
                marea = xywhn[i][:, 2:].prod(1)  # mask width, height normalized
                mxyxy = xywh2xyxy(xywhn[i] * torch.tensor([mask_w, mask_h, mask_w, mask_h], device=self.device))
                for bi in b.unique():
                    j = b == bi  # matching index
                    if self.args.overlap_mask:
                        mask_gti = torch.where(masks[bi][None] == tidxs[i][j].view(-1, 1, 1), 1.0, 0.0)
                    else:
                        mask_gti = masks[tidxs[i]][j]
                    lseg += single_mask_loss(mask_gti, pmask[j], proto[bi], mxyxy[j], marea[j])
            else:
                lseg += (proto * 0).sum()
            obji = BCEobj(pi[..., 4], tobj)
            lobj += obji * balance[i]  # obj loss
            if autobalance:
                balance[i] = balance[i] * 0.9999 + 0.0001 / obji.detach().item()
        if autobalance:
            balance = [x / balance[ssi] for x in balance]
        lbox *= self.args.box
        lobj *= self.args.obj
        lcls *= self.args.cls
        lseg *= self.args.box / bs
        loss = lbox + lobj + lcls + lseg
        return loss * bs, torch.cat((lbox, lseg, lobj, lcls)).detach()
    def label_loss_items(self, loss_items=None, prefix="train"):
        """Return "<prefix>/<loss name>" labels, paired with ``loss_items`` when given.

        With ``loss_items=None`` the list of labels is returned; otherwise a dict mapping each
        label to the corresponding entry of ``loss_items`` (paired positionally via ``zip``).
        """
        # We should just use named tensors here in future
        labels = [f"{prefix}/{name}" for name in self.loss_names]
        if loss_items is None:
            return labels
        return dict(zip(labels, loss_items))
    def progress_string(self):
        """Return the header row of the progress table: a newline followed by 11-wide columns.

        The column count is derived from the header tuple, so it stays correct for any number
        of entries in ``self.loss_names`` instead of assuming exactly four.
        """
        headers = ('Epoch', 'GPU_mem', *self.loss_names, 'Instances', 'Size')
        return ('\n' + '%11s' * len(headers)) % headers
    def plot_training_samples(self, batch, ni):
        images = batch["img"]
@ -227,6 +49,129 @@ class SegmentationTrainer(DetectionTrainer):
        plot_results(file=self.csv, segment=True)  # save results.png
 # Criterion class for computing training losses
 class SegLoss:
    """Training criterion producing box, segmentation, classification and DFL losses.

    Calling an instance with the model predictions and a batch dict returns
    ``(summed_loss * batch_size, loss_items)`` where ``loss_items`` is a detached 4-element tensor
    ordered ``(box, seg, cls, dfl)``.
    """

    def __init__(self, model, overlap=True):  # model must be de-paralleled
        """Cache head attributes and build the target assigner and box-loss helper.

        Args:
            model: De-paralleled model exposing ``parameters()``, ``args`` and ``model[-1]`` (the head).
            overlap: When True, ``batch["masks"]`` is read as one index-encoded mask per image in which
                pixel value ``k + 1`` marks the image's k-th target; when False it is read as one mask
                per target, ordered like the flat targets.
        """
        device = next(model.parameters()).device  # get model device
        h = model.args  # hyperparameters
        m = model.model[-1]  # head module (last layer of the model)
        self.bce = nn.BCEWithLogitsLoss(reduction='none')
        self.hyp = h
        self.stride = m.stride  # model strides
        self.nc = m.nc  # number of classes
        self.no = m.no
        self.nm = m.nm  # number of masks
        self.reg_max = m.reg_max
        self.overlap = overlap
        self.device = device
        self.use_dfl = m.reg_max > 1
        self.assigner = TaskAlignedAssigner(topk=10, num_classes=self.nc, alpha=0.5, beta=6.0)
        self.bbox_loss = BboxLoss(m.reg_max - 1, use_dfl=self.use_dfl).to(device)
        self.proj = torch.arange(m.reg_max, dtype=torch.float, device=device)

    def preprocess(self, targets, batch_size, scale_tensor):
        """Group flat targets per image into a zero-padded tensor with scaled xyxy boxes.

        Args:
            targets: 2-D tensor whose column 0 is the image index; the remaining five columns are copied
                unchanged (the caller passes the class followed by four box values).
            batch_size: Number of images in the batch.
            scale_tensor: Multiplier applied to the four box columns before ``xywh2xyxy``.

        Returns:
            Tensor of shape ``(batch_size, max_targets_per_image, 5)``; rows beyond an image's target
            count stay zero. With no targets at all the middle dimension is 0.
        """
        if targets.shape[0] == 0:
            out = torch.zeros(batch_size, 0, 5, device=self.device)
        else:
            i = targets[:, 0]  # image index
            _, counts = i.unique(return_counts=True)
            out = torch.zeros(batch_size, counts.max(), 5, device=self.device)
            for j in range(batch_size):
                matches = i == j
                n = matches.sum()
                if n:
                    out[j, :n] = targets[matches, 1:]
            out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
        return out

    def bbox_decode(self, anchor_points, pred_dist):
        """Decode predicted distances into xyxy boxes relative to ``anchor_points``.

        With DFL enabled, each of the four sides is predicted as a distribution over ``reg_max`` bins;
        the softmax expectation over ``self.proj`` collapses it to a single distance per side.
        """
        if self.use_dfl:
            b, a, c = pred_dist.shape  # batch, anchors, channels
            pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
        return dist2bbox(pred_dist, anchor_points, xywh=False)

    def __call__(self, preds, batch):
        """Compute the training losses for one batch.

        Args:
            preds: Either the 3-tuple ``(feats, pred_masks, proto)`` or a sequence whose element 1 is
                that tuple.
            batch: Dict providing ``"batch_idx"``, ``"cls"``, ``"bboxes"`` and ``"masks"``.
                NOTE(review): boxes are scaled by the image size and passed through ``xywh2xyxy``, so
                they are presumably normalized xywh — confirm against the dataloader.

        Returns:
            Tuple ``(loss.sum() * batch_size, loss.detach())`` with ``loss`` ordered (box, seg, cls, dfl).
        """
        loss = torch.zeros(4, device=self.device)  # box, seg, cls, dfl
        feats, pred_masks, proto = preds if len(preds) == 3 else preds[1]
        batch_size, _, mask_h, mask_w = proto.shape  # batch size, number of masks, mask height, mask width
        pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
            (self.reg_max * 4, self.nc), 1)

        # b, grids, ..
        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
        pred_distri = pred_distri.permute(0, 2, 1).contiguous()
        pred_masks = pred_masks.permute(0, 2, 1).contiguous()

        dtype = pred_scores.dtype
        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # image size (h,w)
        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)

        # targets
        batch_idx = batch["batch_idx"].view(-1, 1)
        targets = torch.cat((batch_idx, batch["cls"].view(-1, 1), batch["bboxes"]), 1)
        targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
        gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
        mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)  # 1 for real targets, 0 for all-zero padding rows

        masks = batch["masks"].to(self.device).float()
        if tuple(masks.shape[-2:]) != (mask_h, mask_w):  # downsample
            masks = F.interpolate(masks[None], (mask_h, mask_w), mode="nearest")[0]

        # pboxes
        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)

        _, target_bboxes, target_scores, fg_mask, target_gt_idx = self.assigner(
            pred_scores.detach().sigmoid(), (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
            anchor_points * stride_tensor, gt_labels, gt_bboxes, mask_gt)

        # Clamp the normalizer so a batch without any assigned target does not divide by zero
        target_scores_sum = target_scores.sum().clamp(min=1)

        # cls loss
        loss[2] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum  # BCE

        # bbox loss
        if fg_mask.sum():
            loss[0], loss[3] = self.bbox_loss(pred_distri, pred_bboxes, anchor_points, target_bboxes / stride_tensor,
                                              target_scores, target_scores_sum, fg_mask)
            # Loop-invariant helpers for the per-image mask loss
            img_scale = imgsz[[1, 0, 1, 0]]
            mask_scale = torch.tensor([mask_w, mask_h, mask_w, mask_h], device=self.device)
            flat_batch_idx = batch_idx.view(-1)  # 1-D so it can select along the first mask dimension
            for i in range(batch_size):
                if fg_mask[i].sum():
                    mask_idx = target_gt_idx[i][fg_mask[i]]  # 0-based target index within image i
                    if self.overlap:
                        # Index-encoded mask: pixel value k + 1 belongs to target k
                        gt_mask = torch.where(masks[[i]] == (mask_idx + 1).view(-1, 1, 1), 1.0, 0.0)
                    else:
                        # One mask per target: select this image's masks, then pick by 0-based index
                        gt_mask = masks[flat_batch_idx == i][mask_idx]
                    xyxyn = target_bboxes[i][fg_mask[i]] / img_scale
                    marea = xyxy2xywh(xyxyn)[:, 2:].prod(1)
                    mxyxy = xyxyn * mask_scale
                    loss[1] += self.single_mask_loss(gt_mask, pred_masks[i][fg_mask[i]], proto[i], mxyxy,
                                                     marea)  # seg loss
        # WARNING: Uncomment lines below in case of Multi-GPU DDP unused gradient errors
        #         else:
        #             loss[1] += proto.sum() * 0
        # else:
        #     loss[1] += proto.sum() * 0

        loss[0] *= 7.5  # box gain
        loss[1] *= 7.5 / batch_size  # seg gain
        loss[2] *= 0.5  # cls gain
        loss[3] *= 1.5  # dfl gain

        return loss.sum() * batch_size, loss.detach()  # loss(box, seg, cls, dfl)

    def single_mask_loss(self, gt_mask, pred, proto, xyxy, area):
        """Mask loss for one image.

        Combines the per-target coefficients ``pred`` with the prototypes ``proto`` into mask logits,
        takes the element-wise BCE against ``gt_mask``, crops it to each box ``xyxy``, and returns the
        mean over targets of the per-target mean loss divided by ``area``.
        """
        pred_mask = (pred @ proto.view(self.nm, -1)).view(-1, *proto.shape[1:])  # (n, 32) @ (32,80,80) -> (n,80,80)
        loss = F.binary_cross_entropy_with_logits(pred_mask, gt_mask, reduction="none")
        return (crop_mask(loss, xyxy).mean(dim=(1, 2)) / area).mean()
@hydra.main(version_base=None, config_path=str(DEFAULT_CONFIG.parent), config_name=DEFAULT_CONFIG.name)
 def train(cfg):
    cfg.model = cfg.model or "models/yolov8n-seg.yaml"
--- a/ultralytics/yolo/v8/segment/val.py
+++ b/ultralytics/yolo/v8/segment/val.py
@ -66,7 +66,7 @@ class SegmentationValidator(DetectionValidator):
                                    agnostic=self.args.single_cls,
                                    max_det=self.args.max_det,
                                    nm=self.nm)
-        return p, preds[1], preds[2]
+        return p, preds[1][-1]
    def update_metrics(self, preds, batch):
        # Metrics