Integration of v8 segmentation (#107)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
2022-12-28 23:01:38 +08:00
parent 384f0ef1c6
commit 8406b49b49
16 changed files with 422 additions and 224 deletions
--- a/.gitignore
+++ b/.gitignore
@ -133,4 +133,18 @@ datasets/
 runs/
 wandb/

-.DS_Store
+.DS_Store
+
+# Neural Network weights -----------------------------------------------------------------------------------------------
+*.weights
+*.pt
+*.pb
+*.onnx
+*.engine
+*.mlmodel
+*.torchscript
+*.tflite
+*.h5
+*_saved_model/
+*_web_model/
+*_openvino_model/
--- a/tests/check_flops.py
+++ b/tests/check_flops.py
@ -0,0 +1,67 @@
+import torch
+
+from ultralytics import YOLO
+from ultralytics.nn.modules import Detect, Segment
+
+
+def export_onnx(model, file):
+    # YOLOv8 ONNX export
+    import onnx
+    im = torch.zeros(1, 3, 640, 640)
+    model.eval()
+    model(im, profile=True)
+    for k, m in model.named_modules():
+        if isinstance(m, (Detect, Segment)):
+            m.export = True
+
+    torch.onnx.export(
+        model,
+        im,
+        file,
+        verbose=False,
+        opset_version=12,
+        do_constant_folding=True,  # WARNING: DNN inference with torch>=1.12 may require do_constant_folding=False
+        input_names=['images'])
+
+    # Checks
+    model_onnx = onnx.load(file)  # load onnx model
+    onnx.checker.check_model(model_onnx)  # check onnx model
+
+    # Metadata
+    d = {'stride': int(max(model.stride)), 'names': model.names}
+    for k, v in d.items():
+        meta = model_onnx.metadata_props.add()
+        meta.key, meta.value = k, str(v)
+    onnx.save(model_onnx, file)
+
+
+if __name__ == "__main__":
+    model = YOLO()
+    print("yolov8n")
+    model.new("yolov8n.yaml")
+    print("yolov8n-seg")
+    model.new("yolov8n-seg.yaml")
+    print("yolov8s")
+    model.new("yolov8s.yaml")
+    # export_onnx(model.model, "yolov8s.onnx")
+    print("yolov8s-seg")
+    model.new("yolov8s-seg.yaml")
+    # export_onnx(model.model, "yolov8s-seg.onnx")
+    print("yolov8m")
+    model.new("yolov8m.yaml")
+    print("yolov8m-seg")
+    model.new("yolov8m-seg.yaml")
+    print("yolov8l")
+    model.new("yolov8l.yaml")
+    print("yolov8l-seg")
+    model.new("yolov8l-seg.yaml")
+    print("yolov8x")
+    model.new("yolov8x.yaml")
+    print("yolov8x-seg")
+    model.new("yolov8x-seg.yaml")
+
+    # n vs n-seg: 8.9GFLOPs vs 12.8GFLOPs, 3.16M vs 3.6M. ch[0] // 4 (11.9GFLOPs, 3.39M)
+    # s vs s-seg: 28.8GFLOPs vs 44.4GFLOPs, 11.1M vs 12.9M. ch[0] // 4 (39.5GFLOPs, 11.7M)
+    # m vs m-seg: 79.3GFLOPs vs 113.8GFLOPs, 25.9M vs 29.5M. ch[0] // 4 (103GFLOPs, 27.1M)
+    # l vs l-seg: 165.7GFLOPs vs 226.3GFLOPs, 43.7M vs 49.6M. ch[0] // 4 (207GFLOPs, 45.7M)
+    # x vs x-seg: 258.5GFLOPs vs 353.0GFLOPs, 68.3M vs 77.5M. ch[0] // 4 (324GFLOPs, 71.4M)
--- a/ultralytics/nn/modules.py
+++ b/ultralytics/nn/modules.py
@ -576,11 +576,11 @@ class Detections:


 class Proto(nn.Module):
-    # YOLOv5 mask Proto module for segmentation models
+    # YOLOv8 mask Proto module for segmentation models
    def __init__(self, c1, c_=256, c2=32):  # ch_in, number of protos, number of masks
        super().__init__()
        self.cv1 = Conv(c1, c_, k=3)
-        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
+        self.upsample = nn.ConvTranspose2d(c_, c_, 2, 2, 0, bias=True)  # nn.Upsample(scale_factor=2, mode='nearest')
        self.cv2 = Conv(c_, c_, k=3)
        self.cv3 = Conv(c_, c2)

@ -628,16 +628,16 @@ class Detect(nn.Module):
        shape = x[0].shape  # BCHW
        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
-        box, cls = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2).split((self.reg_max * 4, self.nc), 1)
        if self.training:
-            return x, box, cls
+            return x
        elif self.dynamic or self.shape != shape:
            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
            self.shape = shape

+        box, cls = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2).split((self.reg_max * 4, self.nc), 1)
        dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
        y = torch.cat((dbox, cls.sigmoid()), 1)
-        return y if self.export else (y, (x, box, cls))
+        return y if self.export else (y, x)

    def bias_init(self):
        # Initialize Detect() biases, WARNING: requires stride availability
@ -651,19 +651,27 @@ class Detect(nn.Module):

 class Segment(Detect):
    # YOLOv5 Segment head for segmentation models
-    def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=()):
-        super().__init__(nc, anchors, ch)
+    def __init__(self, nc=80, nm=32, npr=256, ch=()):
+        super().__init__(nc, ch)
        self.nm = nm  # number of masks
        self.npr = npr  # number of protos
-        self.no = 5 + nc + self.nm  # number of outputs per anchor
-        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
        self.proto = Proto(ch[0], self.npr, self.nm)  # protos
        self.detect = Detect.forward

+        c4 = max(ch[0] // 4, self.nm)
+        self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nm, 1)) for x in ch)
+
    def forward(self, x):
        p = self.proto(x[0])
+
+        mc = []  # mask coefficient
+        for i in range(self.nl):
+            mc.append(self.cv4[i](x[i]))
+        mc = torch.cat([mi.view(p.shape[0], self.nm, -1) for mi in mc], 2)
        x = self.detect(self, x)
-        return (x, p) if self.training else (x[0], p) if self.export else (x[0], p, x[1])
+        if self.training:
+            return x, mc, p
+        return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p))


 class Classify(nn.Module):
--- a/ultralytics/nn/tasks.py
+++ b/ultralytics/nn/tasks.py
@ -101,7 +101,7 @@ class DetectionModel(BaseModel):
        if isinstance(m, (Detect, Segment)):
            s = 256  # 2x min stride
            m.inplace = self.inplace
-            forward = lambda x: self.forward(x)[0] if isinstance(m, (Segment, Detect)) else self.forward(x)
+            forward = lambda x: self.forward(x)[0] if isinstance(m, Segment) else self.forward(x)
            m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s))])  # forward
            self.stride = m.stride
            m.bias_init()  # only run once
@ -163,8 +163,8 @@ class DetectionModel(BaseModel):

 class SegmentationModel(DetectionModel):
    # YOLOv5 segmentation model
-    def __init__(self, cfg='yolov5s-seg.yaml', ch=3, nc=None):
-        super().__init__(cfg, ch, nc)
+    def __init__(self, cfg='yolov5s-seg.yaml', ch=3, nc=None, verbose=True):
+        super().__init__(cfg, ch, nc, verbose)


 class ClassificationModel(BaseModel):
@ -300,7 +300,7 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
        elif m in {Detect, Segment}:
            args.append([ch[x] for x in f])
            if m is Segment:
-                args[3] = make_divisible(args[3] * gw, 8)
+                args[2] = make_divisible(args[2] * gw, 8)
        else:
            c2 = ch[f]

--- a/ultralytics/yolo/init.py
+++ b/ultralytics/yolo/init.py
@ -0,0 +1 @@
+from . import v8
--- a/ultralytics/yolo/utils/configs/default.yaml
+++ b/ultralytics/yolo/utils/configs/default.yaml
@ -3,7 +3,7 @@

 # Task and Mode
 task: "classify"  # choices=['detect', 'segment', 'classify', 'init'] # init is a special case
-mode: "train"  # choice=['train', 'val', 'infer']
+mode: "train"  # choices=['train', 'val', 'predict']

 # Train settings -------------------------------------------------------------------------------------------------------
 model: null  # i.e. yolov5s.pt, yolo.yaml
--- a/ultralytics/yolo/utils/tal.py
+++ b/ultralytics/yolo/utils/tal.py
@ -86,7 +86,8 @@ class TaskAlignedAssigner(nn.Module):
        if self.n_max_boxes == 0:
            device = gt_bboxes.device
            return (torch.full_like(pd_scores[..., 0], self.bg_idx).to(device), torch.zeros_like(pd_bboxes).to(device),
-                    torch.zeros_like(pd_scores).to(device), torch.zeros_like(pd_scores[..., 0]).to(device))
+                    torch.zeros_like(pd_scores).to(device), torch.zeros_like(pd_scores[..., 0]).to(device),
+                    torch.zeros_like(pd_scores[..., 0]).to(device))

        mask_pos, align_metric, overlaps = self.get_pos_mask(pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points,
                                                             mask_gt)
@ -103,7 +104,7 @@ class TaskAlignedAssigner(nn.Module):
        norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).amax(-2).unsqueeze(-1)
        target_scores = target_scores * norm_align_metric

-        return target_labels, target_bboxes, target_scores, fg_mask.bool()
+        return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_gt_idx

    def get_pos_mask(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt):
        # get anchor_align metric, (b, max_num_obj, h*w)
@ -146,9 +147,6 @@ class TaskAlignedAssigner(nn.Module):
        # (b, max_num_obj, topk, h*w) -> (b, max_num_obj, h*w)
        is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(-2)
        # filter invalid bboxes
-        # assigned topk should be unique, this is for dealing with empty labels
-        # since empty labels will generate index `0` through `F.one_hot`
-        # NOTE: but what if the topk_idxs include `0`?
        is_in_topk = torch.where(is_in_topk > 1, 0, is_in_topk)
        return is_in_topk.to(metrics.dtype)

--- a/ultralytics/yolo/v8/detect/train.py
+++ b/ultralytics/yolo/v8/detect/train.py
@ -9,11 +9,10 @@ from ultralytics.yolo.data.dataloaders.v5loader import create_dataloader
 from ultralytics.yolo.engine.trainer import DEFAULT_CONFIG, BaseTrainer
 from ultralytics.yolo.utils import colorstr
 from ultralytics.yolo.utils.loss import BboxLoss
-from ultralytics.yolo.utils.metrics import smooth_BCE
 from ultralytics.yolo.utils.ops import xywh2xyxy
 from ultralytics.yolo.utils.plotting import plot_images, plot_results
 from ultralytics.yolo.utils.tal import TaskAlignedAssigner, dist2bbox, make_anchors
-from ultralytics.yolo.utils.torch_utils import de_parallel, strip_optimizer
+from ultralytics.yolo.utils.torch_utils import de_parallel


 # BaseTrainer python usage
@ -78,7 +77,8 @@ class DetectionTrainer(BaseTrainer):
        return dict(zip(keys, loss_items)) if loss_items is not None else keys

    def progress_string(self):
-        return ('\n' + '%11s' * 7) % ('Epoch', 'GPU_mem', *self.loss_names, 'Instances', 'Size')
+        return ('\n' + '%11s' *
+                (4 + len(self.loss_names))) % ('Epoch', 'GPU_mem', *self.loss_names, 'Instances', 'Size')

    def plot_training_samples(self, batch, ni):
        plot_images(images=batch["img"],
@ -100,15 +100,13 @@ class Loss:
        device = next(model.parameters()).device  # get model device
        h = model.args  # hyperparameters

-        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
-        self.cp, self.cn = smooth_BCE(eps=h.get("label_smoothing", 0.0))  # positive, negative BCE targets
-
        m = model.model[-1]  # Detect() module
        self.bce = nn.BCEWithLogitsLoss(reduction='none')
        self.hyp = h
        self.stride = m.stride  # model strides
        self.nc = m.nc  # number of classes
-        self.nl = m.nl  # number of layers
+        self.no = m.no
+        self.reg_max = m.reg_max
        self.device = device

        self.use_dfl = m.reg_max > 1
@ -141,12 +139,15 @@ class Loss:

    def __call__(self, preds, batch):
        loss = torch.zeros(3, device=self.device)  # box, cls, dfl
-        feats, pred_distri, pred_scores = preds if len(preds) == 3 else preds[1]
+        feats = preds[1] if isinstance(preds, tuple) else preds
+        pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
+            (self.reg_max * 4, self.nc), 1)
+
        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
        pred_distri = pred_distri.permute(0, 2, 1).contiguous()

        dtype = pred_scores.dtype
-        batch_size, grid_size = pred_scores.shape[:2]
+        batch_size = pred_scores.shape[0]
        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # image size (h,w)
        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)

@ -159,7 +160,7 @@ class Loss:
        # pboxes
        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)

-        target_labels, target_bboxes, target_scores, fg_mask = self.assigner(
+        _, target_bboxes, target_scores, fg_mask, _ = self.assigner(
            pred_scores.detach().sigmoid(), (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
            anchor_points * stride_tensor, gt_labels, gt_bboxes, mask_gt)

--- a/ultralytics/yolo/v8/models/seg/yolov8l-seg.yaml
+++ b/ultralytics/yolo/v8/models/seg/yolov8l-seg.yaml
@ -0,0 +1,42 @@
+# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
+
+# Parameters
+nc: 80  # number of classes
+depth_multiple: 1.00  # model depth multiple
+width_multiple: 1.00  # layer channel multiple
+
+# YOLOv8.0l backbone
+backbone:
+  # [from, number, module, args]
+  [[-1, 1, Conv, [64, 3, 2]],  # 0-P1/2
+   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
+   [-1, 3, C2f, [128, True]],
+   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
+   [-1, 6, C2f, [256, True]],
+   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
+   [-1, 6, C2f, [512, True]],
+   [-1, 1, Conv, [512, 3, 2]],  # 7-P5/32
+   [-1, 3, C2f, [512, True]],
+   [-1, 1, SPPF, [512, 5]],  # 9
+  ]
+
+# YOLOv8.0l head
+head:
+  [[-1, 1, nn.Upsample, [None, 2, 'nearest']],
+   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
+   [-1, 3, C2f, [512]],  # 12
+
+   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
+   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
+   [-1, 3, C2f, [256]],  # 15 (P3/8-small)
+
+   [-1, 1, Conv, [256, 3, 2]],
+   [[-1, 12], 1, Concat, [1]],  # cat head P4
+   [-1, 3, C2f, [512]],  # 18 (P4/16-medium)
+
+   [-1, 1, Conv, [512, 3, 2]],
+   [[-1, 9], 1, Concat, [1]],  # cat head P5
+   [-1, 3, C2f, [512]],  # 21 (P5/32-large)
+
+   [[15, 18, 21], 1, Segment, [nc, 32, 256]],  # Segment(P3, P4, P5)
+  ]
--- a/ultralytics/yolo/v8/models/seg/yolov8m-seg.yaml
+++ b/ultralytics/yolo/v8/models/seg/yolov8m-seg.yaml
@ -0,0 +1,42 @@
+# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
+
+# Parameters
+nc: 80  # number of classes
+depth_multiple: 0.67  # model depth multiple
+width_multiple: 0.75  # layer channel multiple
+
+# YOLOv8.0m backbone
+backbone:
+  # [from, number, module, args]
+  [[-1, 1, Conv, [64, 3, 2]],  # 0-P1/2
+   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
+   [-1, 3, C2f, [128, True]],
+   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
+   [-1, 6, C2f, [256, True]],
+   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
+   [-1, 6, C2f, [512, True]],
+   [-1, 1, Conv, [768, 3, 2]],  # 7-P5/32
+   [-1, 3, C2f, [768, True]],
+   [-1, 1, SPPF, [768, 5]],  # 9
+  ]
+
+# YOLOv8.0m head
+head:
+  [[-1, 1, nn.Upsample, [None, 2, 'nearest']],
+   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
+   [-1, 3, C2f, [512]],  # 12
+
+   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
+   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
+   [-1, 3, C2f, [256]],  # 15 (P3/8-small)
+
+   [-1, 1, Conv, [256, 3, 2]],
+   [[-1, 12], 1, Concat, [1]],  # cat head P4
+   [-1, 3, C2f, [512]],  # 18 (P4/16-medium)
+
+   [-1, 1, Conv, [512, 3, 2]],
+   [[-1, 9], 1, Concat, [1]],  # cat head P5
+   [-1, 3, C2f, [768]],  # 21 (P5/32-large)
+
+   [[15, 18, 21], 1, Segment, [nc, 32, 256]],  # Segment(P3, P4, P5)
+  ]
--- a/ultralytics/yolo/v8/models/seg/yolov8n-seg.yaml
+++ b/ultralytics/yolo/v8/models/seg/yolov8n-seg.yaml
@ -4,9 +4,8 @@
 nc: 80  # number of classes
 depth_multiple: 0.33  # model depth multiple
 width_multiple: 0.25  # layer channel multiple
-anchors: [[16,19], [55,65], [178,192]]

-# YOLOv8n v0.0 backbone
+# YOLOv8.0n backbone
 backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 3, 2]],  # 0-P1/2
@ -21,7 +20,7 @@ backbone:
   [-1, 1, SPPF, [1024, 5]],  # 9
  ]

-# YOLOv8n v0.0 head
+# YOLOv8.0n head
 head:
  [[-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
--- a/ultralytics/yolo/v8/models/seg/yolov8s-seg.yaml
+++ b/ultralytics/yolo/v8/models/seg/yolov8s-seg.yaml
@ -0,0 +1,42 @@
+# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
+
+# Parameters
+nc: 80  # number of classes
+depth_multiple: 0.33  # model depth multiple
+width_multiple: 0.50  # layer channel multiple
+
+# YOLOv8.0s backbone
+backbone:
+  # [from, number, module, args]
+  [[-1, 1, Conv, [64, 3, 2]],  # 0-P1/2
+   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
+   [-1, 3, C2f, [128, True]],
+   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
+   [-1, 6, C2f, [256, True]],
+   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
+   [-1, 6, C2f, [512, True]],
+   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
+   [-1, 3, C2f, [1024, True]],
+   [-1, 1, SPPF, [1024, 5]],  # 9
+  ]
+
+# YOLOv8.0s head
+head:
+  [[-1, 1, nn.Upsample, [None, 2, 'nearest']],
+   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
+   [-1, 3, C2f, [512]],  # 12
+
+   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
+   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
+   [-1, 3, C2f, [256]],  # 15 (P3/8-small)
+
+   [-1, 1, Conv, [256, 3, 2]],
+   [[-1, 12], 1, Concat, [1]],  # cat head P4
+   [-1, 3, C2f, [512]],  # 18 (P4/16-medium)
+
+   [-1, 1, Conv, [512, 3, 2]],
+   [[-1, 9], 1, Concat, [1]],  # cat head P5
+   [-1, 3, C2f, [1024]],  # 21 (P5/32-large)
+
+   [[15, 18, 21], 1, Segment, [nc, 32, 256]],  # Segment(P3, P4, P5)
+  ]
--- a/ultralytics/yolo/v8/models/seg/yolov8x-seg.yaml
+++ b/ultralytics/yolo/v8/models/seg/yolov8x-seg.yaml
@ -0,0 +1,42 @@
+# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
+
+# Parameters
+nc: 80  # number of classes
+depth_multiple: 1.00  # model depth multiple
+width_multiple: 1.25  # layer channel multiple
+
+# YOLOv8.0x backbone
+backbone:
+  # [from, number, module, args]
+  [[-1, 1, Conv, [64, 3, 2]],  # 0-P1/2
+   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
+   [-1, 3, C2f, [128, True]],
+   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
+   [-1, 6, C2f, [256, True]],
+   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
+   [-1, 6, C2f, [512, True]],
+   [-1, 1, Conv, [512, 3, 2]],  # 7-P5/32
+   [-1, 3, C2f, [512, True]],
+   [-1, 1, SPPF, [512, 5]],  # 9
+  ]
+
+# YOLOv8.0x head
+head:
+  [[-1, 1, nn.Upsample, [None, 2, 'nearest']],
+   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
+   [-1, 3, C2f, [512]],  # 12
+
+   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
+   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
+   [-1, 3, C2f, [256]],  # 15 (P3/8-small)
+
+   [-1, 1, Conv, [256, 3, 2]],
+   [[-1, 12], 1, Concat, [1]],  # cat head P4
+   [-1, 3, C2f, [512]],  # 18 (P4/16-medium)
+
+   [-1, 1, Conv, [512, 3, 2]],
+   [[-1, 9], 1, Concat, [1]],  # cat head P5
+   [-1, 3, C2f, [512]],  # 21 (P5/32-large)
+
+   [[15, 18, 21], 1, Segment, [nc, 32, 256]],  # Segment(P3, P4, P5)
+  ]
--- a/ultralytics/yolo/v8/segment/predict.py
+++ b/ultralytics/yolo/v8/segment/predict.py
@ -12,17 +12,14 @@ class SegmentationPredictor(DetectionPredictor):

    def postprocess(self, preds, img, orig_img):
        masks = []
-        if len(preds) == 2:  # eval
-            p, proto, = preds
-        else:  # len(3) train
-            p, proto, _ = preds
        # TODO: filter by classes
-        p = ops.non_max_suppression(p,
+        p = ops.non_max_suppression(preds[0],
                                    self.args.conf_thres,
                                    self.args.iou_thres,
                                    agnostic=self.args.agnostic_nms,
                                    max_det=self.args.max_det,
                                    nm=32)
+        proto = preds[1][-1]
        for i, pred in enumerate(p):
            shape = orig_img[i].shape if self.webcam else orig_img.shape
            if not len(pred):
--- a/ultralytics/yolo/v8/segment/train.py
+++ b/ultralytics/yolo/v8/segment/train.py
@ -6,9 +6,10 @@ import torch.nn.functional as F
 from ultralytics.nn.tasks import SegmentationModel
 from ultralytics.yolo import v8
 from ultralytics.yolo.engine.trainer import DEFAULT_CONFIG
-from ultralytics.yolo.utils.metrics import FocalLoss, bbox_iou, smooth_BCE
-from ultralytics.yolo.utils.ops import crop_mask, xywh2xyxy
+from ultralytics.yolo.utils.loss import BboxLoss
+from ultralytics.yolo.utils.ops import crop_mask, xywh2xyxy, xyxy2xywh
 from ultralytics.yolo.utils.plotting import plot_images, plot_results
+from ultralytics.yolo.utils.tal import TaskAlignedAssigner, dist2bbox, make_anchors
 from ultralytics.yolo.utils.torch_utils import de_parallel

 from ..detect import DetectionTrainer
@ -31,188 +32,9 @@ class SegmentationTrainer(DetectionTrainer):
                                                args=self.args)

    def criterion(self, preds, batch):
-        head = de_parallel(self.model).model[-1]
-        sort_obj_iou = False
-        autobalance = False
-
-        # init losses
-        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([self.args.cls_pw], device=self.device))
-        BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([self.args.obj_pw], device=self.device))
-
-        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
-        cp, cn = smooth_BCE(eps=self.args.label_smoothing)  # positive, negative BCE targets
-
-        # Focal loss
-        g = self.args.fl_gamma
-        if self.args.fl_gamma > 0:
-            BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g)
-
-        balance = {3: [4.0, 1.0, 0.4]}.get(head.nl, [4.0, 1.0, 0.25, 0.06, 0.02])  # P3-P7
-        ssi = list(head.stride).index(16) if autobalance else 0  # stride 16 index
-        BCEcls, BCEobj, gr, autobalance = BCEcls, BCEobj, 1.0, autobalance
-
-        def single_mask_loss(gt_mask, pred, proto, xyxy, area):
-            # Mask loss for one image
-            pred_mask = (pred @ proto.view(head.nm, -1)).view(-1, *proto.shape[1:])  # (n,32) @ (32,80,80) -> (n,80,80)
-            loss = F.binary_cross_entropy_with_logits(pred_mask, gt_mask, reduction="none")
-            return (crop_mask(loss, xyxy).mean(dim=(1, 2)) / area).mean()
-
-        def build_targets(p, targets):
-            # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
-            nonlocal head
-            na, nt = head.na, targets.shape[0]  # number of anchors, targets
-            tcls, tbox, indices, anch, tidxs, xywhn = [], [], [], [], [], []
-            gain = torch.ones(8, device=self.device)  # normalized to gridspace gain
-            ai = torch.arange(na, device=self.device).float().view(na, 1).repeat(1,
-                                                                                 nt)  # same as .repeat_interleave(nt)
-            if self.args.overlap_mask:
-                batch = p[0].shape[0]
-                ti = []
-                for i in range(batch):
-                    num = (targets[:, 0] == i).sum()  # find number of targets of each image
-                    ti.append(torch.arange(num, device=self.device).float().view(1, num).repeat(na, 1) + 1)  # (na, num)
-                ti = torch.cat(ti, 1)  # (na, nt)
-            else:
-                ti = torch.arange(nt, device=self.device).float().view(1, nt).repeat(na, 1)
-            targets = torch.cat((targets.repeat(na, 1, 1), ai[..., None], ti[..., None]), 2)  # append anchor indices
-
-            g = 0.5  # bias
-            off = torch.tensor(
-                [
-                    [0, 0],
-                    [1, 0],
-                    [0, 1],
-                    [-1, 0],
-                    [0, -1],  # j,k,l,m
-                    # [1, 1], [1, -1], [-1, 1], [-1, -1],  # jk,jm,lk,lm
-                ],
-                device=self.device).float() * g  # offsets
-
-            for i in range(head.nl):
-                anchors, shape = head.anchors[i], p[i].shape
-                gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]]  # xyxy gain
-
-                # Match targets to anchors
-                t = targets * gain  # shape(3,n,7)
-                if nt:
-                    # Matches
-                    r = t[..., 4:6] / anchors[:, None]  # wh ratio
-                    j = torch.max(r, 1 / r).max(2)[0] < self.args.anchor_t  # compare
-                    # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t']  # iou(3,n)=wh_iou(anchors(3,2), gwh(n,2))
-                    t = t[j]  # filter
-
-                    # Offsets
-                    gxy = t[:, 2:4]  # grid xy
-                    gxi = gain[[2, 3]] - gxy  # inverse
-                    j, k = ((gxy % 1 < g) & (gxy > 1)).T
-                    l, m = ((gxi % 1 < g) & (gxi > 1)).T
-                    j = torch.stack((torch.ones_like(j), j, k, l, m))
-                    t = t.repeat((5, 1, 1))[j]
-                    offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
-                else:
-                    t = targets[0]
-                    offsets = 0
-
-                # Define
-                bc, gxy, gwh, at = t.chunk(4, 1)  # (image, class), grid xy, grid wh, anchors
-                (a, tidx), (b, c) = at.long().T, bc.long().T  # anchors, image, class
-                gij = (gxy - offsets).long()
-                gi, gj = gij.T  # grid indices
-
-                # Append
-                indices.append((b, a, gj.clamp_(0, shape[2] - 1), gi.clamp_(0, shape[3] - 1)))  # image, anchor, grid
-                tbox.append(torch.cat((gxy - gij, gwh), 1))  # box
-                anch.append(anchors[a])  # anchors
-                tcls.append(c)  # class
-                tidxs.append(tidx)
-                xywhn.append(torch.cat((gxy, gwh), 1) / gain[2:6])  # xywh normalized
-
-            return tcls, tbox, indices, anch, tidxs, xywhn
-
-        if len(preds) == 2:  # eval
-            p, proto, = preds
-        else:  # len(3) train
-            _, proto, p = preds
-
-        targets = torch.cat((batch["batch_idx"].view(-1, 1), batch["cls"].view(-1, 1), batch["bboxes"]), 1)
-        masks = batch["masks"]
-        targets, masks = targets.to(self.device), masks.to(self.device).float()
-
-        bs, nm, mask_h, mask_w = proto.shape  # batch size, number of masks, mask height, mask width
-        lcls = torch.zeros(1, device=self.device)
-        lbox = torch.zeros(1, device=self.device)
-        lobj = torch.zeros(1, device=self.device)
-        lseg = torch.zeros(1, device=self.device)
-        tcls, tbox, indices, anchors, tidxs, xywhn = build_targets(p, targets)
-
-        # Losses
-        for i, pi in enumerate(p):  # layer index, layer predictions
-            b, a, gj, gi = indices[i]  # image, anchor, gridy, gridx
-            tobj = torch.zeros(pi.shape[:4], dtype=pi.dtype, device=self.device)  # target obj
-
-            n = b.shape[0]  # number of targets
-            if n:
-                pxy, pwh, _, pcls, pmask = pi[b, a, gj, gi].split((2, 2, 1, head.nc, nm), 1)  # subset of predictions
-
-                # Box regression
-                pxy = pxy.sigmoid() * 2 - 0.5
-                pwh = (pwh.sigmoid() * 2) ** 2 * anchors[i]
-                pbox = torch.cat((pxy, pwh), 1)  # predicted box
-                iou = bbox_iou(pbox, tbox[i], CIoU=True).squeeze()  # iou(prediction, target)
-                lbox += (1.0 - iou).mean()  # iou loss
-
-                # Objectness
-                iou = iou.detach().clamp(0).type(tobj.dtype)
-                if sort_obj_iou:
-                    j = iou.argsort()
-                    b, a, gj, gi, iou = b[j], a[j], gj[j], gi[j], iou[j]
-                if gr < 1:
-                    iou = (1.0 - gr) + gr * iou
-                tobj[b, a, gj, gi] = iou  # iou ratio
-
-                # Classification
-                if head.nc > 1:  # cls loss (only if multiple classes)
-                    t = torch.full_like(pcls, cn, device=self.device)  # targets
-                    t[range(n), tcls[i]] = cp
-                    lcls += BCEcls(pcls, t)  # BCE
-
-                # Mask regression
-                if tuple(masks.shape[-2:]) != (mask_h, mask_w):  # downsample
-                    masks = F.interpolate(masks[None], (mask_h, mask_w), mode="nearest")[0]
-                marea = xywhn[i][:, 2:].prod(1)  # mask width, height normalized
-                mxyxy = xywh2xyxy(xywhn[i] * torch.tensor([mask_w, mask_h, mask_w, mask_h], device=self.device))
-                for bi in b.unique():
-                    j = b == bi  # matching index
-                    if self.args.overlap_mask:
-                        mask_gti = torch.where(masks[bi][None] == tidxs[i][j].view(-1, 1, 1), 1.0, 0.0)
-                    else:
-                        mask_gti = masks[tidxs[i]][j]
-                    lseg += single_mask_loss(mask_gti, pmask[j], proto[bi], mxyxy[j], marea[j])
-            else:
-                lseg += (proto * 0).sum()
-
-            obji = BCEobj(pi[..., 4], tobj)
-            lobj += obji * balance[i]  # obj loss
-            if autobalance:
-                balance[i] = balance[i] * 0.9999 + 0.0001 / obji.detach().item()
-
-        if autobalance:
-            balance = [x / balance[ssi] for x in balance]
-        lbox *= self.args.box
-        lobj *= self.args.obj
-        lcls *= self.args.cls
-        lseg *= self.args.box / bs
-
-        loss = lbox + lobj + lcls + lseg
-        return loss * bs, torch.cat((lbox, lseg, lobj, lcls)).detach()
-
-    def label_loss_items(self, loss_items=None, prefix="train"):
-        # We should just use named tensors here in future
-        keys = [f"{prefix}/{x}" for x in self.loss_names]
-        return dict(zip(keys, loss_items)) if loss_items is not None else keys
-
-    def progress_string(self):
-        return ('\n' + '%11s' * 8) % ('Epoch', 'GPU_mem', *self.loss_names, 'Instances', 'Size')
+        if not hasattr(self, 'compute_loss'):
+            self.compute_loss = SegLoss(de_parallel(self.model), overlap=self.args.overlap_mask)
+        return self.compute_loss(preds, batch)

    def plot_training_samples(self, batch, ni):
        images = batch["img"]
@ -227,6 +49,129 @@ class SegmentationTrainer(DetectionTrainer):
        plot_results(file=self.csv, segment=True)  # save results.png


+# Criterion class for computing training losses
+class SegLoss:
+    """Criterion that computes the box, segmentation, classification and DFL training losses.
+
+    Calling an instance returns ``(loss.sum() * batch_size, loss.detach())`` where ``loss`` is a
+    4-element tensor ordered (box, seg, cls, dfl).
+    """
+
+    def __init__(self, model, overlap=True):  # model must be de-paralleled
+        """Read the head configuration from ``model.model[-1]`` and build the assigner and box loss.
+
+        Args:
+            model: de-paralleled model exposing ``parameters()``, ``args`` and ``model[-1]``.
+            overlap: selects how ``batch["masks"]`` is read in ``__call__``. When True the masks are
+                compared against ``target index + 1`` per image; when False they are indexed per
+                instance.
+        """
+
+        device = next(model.parameters()).device  # get model device
+        h = model.args  # hyperparameters
+
+        m = model.model[-1]  # Detect() module
+        self.bce = nn.BCEWithLogitsLoss(reduction='none')
+        self.hyp = h
+        self.stride = m.stride  # model strides
+        self.nc = m.nc  # number of classes
+        self.no = m.no
+        self.nm = m.nm  # number of masks
+        self.reg_max = m.reg_max
+        self.overlap = overlap
+        self.device = device
+
+        self.use_dfl = m.reg_max > 1
+        self.assigner = TaskAlignedAssigner(topk=10, num_classes=self.nc, alpha=0.5, beta=6.0)
+        self.bbox_loss = BboxLoss(m.reg_max - 1, use_dfl=self.use_dfl).to(device)
+        self.proj = torch.arange(m.reg_max, dtype=torch.float, device=device)  # bin values 0..reg_max-1
+
+    def preprocess(self, targets, batch_size, scale_tensor):
+        """Group flat targets by image into a zero-padded ``(batch_size, max_targets, 5)`` tensor.
+
+        Args:
+            targets: 2-D tensor whose column 0 is the image index and columns 1: are (cls, box).
+            batch_size: number of images in the batch.
+            scale_tensor: 4-element multiplier applied to the box columns before ``xywh2xyxy``.
+
+        Returns:
+            Tensor of shape ``(batch_size, max_targets, 5)``; rows beyond an image's target count
+            stay zero.
+        """
+        if targets.shape[0] == 0:
+            out = torch.zeros(batch_size, 0, 5, device=self.device)
+        else:
+            i = targets[:, 0]  # image index
+            _, counts = i.unique(return_counts=True)
+            out = torch.zeros(batch_size, counts.max(), 5, device=self.device)
+            for j in range(batch_size):
+                matches = i == j
+                n = matches.sum()
+                if n:
+                    out[j, :n] = targets[matches, 1:]
+            out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
+        return out
+
+    def bbox_decode(self, anchor_points, pred_dist):
+        """Decode predicted distances into boxes relative to ``anchor_points``.
+
+        With DFL enabled, each of the 4 sides holds ``c // 4`` logits; they are softmaxed and
+        reduced to an expected distance by a dot product with ``self.proj``.
+        """
+        if self.use_dfl:
+            b, a, c = pred_dist.shape  # batch, anchors, channels
+            pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
+        return dist2bbox(pred_dist, anchor_points, xywh=False)
+
+    def __call__(self, preds, batch):
+        """Compute the training loss for one batch.
+
+        Args:
+            preds: either ``(feats, pred_masks, proto)`` or a sequence whose element ``[1]`` is that
+                3-tuple.
+            batch: mapping providing the keys ``"batch_idx"``, ``"cls"``, ``"bboxes"`` and ``"masks"``.
+
+        Returns:
+            Tuple ``(loss.sum() * batch_size, loss.detach())`` with ``loss`` ordered
+            (box, seg, cls, dfl).
+        """
+        loss = torch.zeros(4, device=self.device)  # box, seg, cls, dfl
+        feats, pred_masks, proto = preds if len(preds) == 3 else preds[1]
+        batch_size, _, mask_h, mask_w = proto.shape  # batch size, number of masks, mask height, mask width
+        pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
+            (self.reg_max * 4, self.nc), 1)
+
+        # b, grids, ..
+        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
+        pred_distri = pred_distri.permute(0, 2, 1).contiguous()
+        pred_masks = pred_masks.permute(0, 2, 1).contiguous()
+
+        dtype = pred_scores.dtype
+        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # image size (h,w)
+        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
+
+        # targets
+        batch_idx = batch["batch_idx"].view(-1, 1)
+        targets = torch.cat((batch_idx, batch["cls"].view(-1, 1), batch["bboxes"]), 1)
+        targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
+        gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
+        mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)  # 1 where the box coordinates sum to > 0, else 0
+
+        masks = batch["masks"].to(self.device).float()
+        if tuple(masks.shape[-2:]) != (mask_h, mask_w):  # downsample
+            masks = F.interpolate(masks[None], (mask_h, mask_w), mode="nearest")[0]
+
+        # pboxes
+        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)
+
+        _, target_bboxes, target_scores, fg_mask, target_gt_idx = self.assigner(
+            pred_scores.detach().sigmoid(), (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
+            anchor_points * stride_tensor, gt_labels, gt_bboxes, mask_gt)
+
+        # Clamp the normaliser: with no assigned targets the sum is 0 and the divisions below
+        # would otherwise produce inf/NaN losses.
+        target_scores_sum = target_scores.sum().clamp(min=1)
+
+        # cls loss
+        loss[2] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum  # BCE
+
+        # bbox loss
+        if fg_mask.sum():
+            loss[0], loss[3] = self.bbox_loss(pred_distri, pred_bboxes, anchor_points, target_bboxes / stride_tensor,
+                                              target_scores, target_scores_sum, fg_mask)
+            # Flat (1-D) image index per target, needed for boolean indexing along dim 0 of masks
+            img_idx = batch_idx.view(-1).to(self.device)
+            for i in range(batch_size):
+                if fg_mask[i].sum():
+                    gt_idx = target_gt_idx[i][fg_mask[i]]  # 0-based index of the matched target
+                    if self.overlap:
+                        # the overlap mask is compared against index + 1 (value 0 never matches)
+                        gt_mask = torch.where(masks[[i]] == (gt_idx + 1).view(-1, 1, 1), 1.0, 0.0)
+                    else:
+                        # select this image's masks, then pick the matched ones by 0-based index
+                        gt_mask = masks[img_idx == i][gt_idx]
+                    xyxyn = target_bboxes[i][fg_mask[i]] / imgsz[[1, 0, 1, 0]]
+                    marea = xyxy2xywh(xyxyn)[:, 2:].prod(1)
+                    mxyxy = xyxyn * torch.tensor([mask_w, mask_h, mask_w, mask_h], device=self.device)
+                    loss[1] += self.single_mask_loss(gt_mask, pred_masks[i][fg_mask[i]], proto[i], mxyxy,
+                                                     marea)  # seg loss
+        # WARNING: Uncomment lines below in case of Multi-GPU DDP unused gradient errors
+        #         else:
+        #             loss[1] += proto.sum() * 0
+        # else:
+        #     loss[1] += proto.sum() * 0
+
+        loss[0] *= 7.5  # box gain
+        loss[1] *= 7.5 / batch_size  # seg gain
+        loss[2] *= 0.5  # cls gain
+        loss[3] *= 1.5  # dfl gain
+
+        return loss.sum() * batch_size, loss.detach()  # loss(box, seg, cls, dfl)
+
+    def single_mask_loss(self, gt_mask, pred, proto, xyxy, area):
+        """Mask loss for one image.
+
+        Builds per-target mask logits as ``pred @ proto``, takes element-wise BCE against
+        ``gt_mask``, crops it with ``crop_mask(loss, xyxy)``, then averages per target, divides by
+        ``area`` and averages over targets.
+        """
+        pred_mask = (pred @ proto.view(self.nm, -1)).view(-1, *proto.shape[1:])  # (n, 32) @ (32,80,80) -> (n,80,80)
+        loss = F.binary_cross_entropy_with_logits(pred_mask, gt_mask, reduction="none")
+        return (crop_mask(loss, xyxy).mean(dim=(1, 2)) / area).mean()
+
@hydra.main(version_base=None, config_path=str(DEFAULT_CONFIG.parent), config_name=DEFAULT_CONFIG.name)
 def train(cfg):
    cfg.model = cfg.model or "models/yolov8n-seg.yaml"
--- a/ultralytics/yolo/v8/segment/val.py
+++ b/ultralytics/yolo/v8/segment/val.py
@ -66,7 +66,7 @@ class SegmentationValidator(DetectionValidator):
                                    agnostic=self.args.single_cls,
                                    max_det=self.args.max_det,
                                    nm=self.nm)
-        return p, preds[1], preds[2]
+        return p, preds[1][-1]

    def update_metrics(self, preds, batch):
        # Metrics