ultralytics 8.0.65 YOLOv8 Pose models (#1347)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Mert Can Demir <validatedev@gmail.com>
Co-authored-by: Laughing <61612323+Laughing-q@users.noreply.github.com>
Co-authored-by: Fabian Greavu <fabiangreavu@gmail.com>
Co-authored-by: Yonghye Kwon <developer.0hye@gmail.com>
Co-authored-by: Eric Pedley <ericpedley@gmail.com>
Co-authored-by: JustasBart <40023722+JustasBart@users.noreply.github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Sergio Sanchez <sergio.ssm.97@gmail.com>
Co-authored-by: Bogdan Gheorghe <112427971+bogdan-galileo@users.noreply.github.com>
Co-authored-by: Jaap van de Loosdrecht <jaap@vdlmv.nl>
Co-authored-by: Noobtoss <96134731+Noobtoss@users.noreply.github.com>
Co-authored-by: nerdyespresso <106761627+nerdyespresso@users.noreply.github.com>
Co-authored-by: Farid Inawan <frdteknikelektro@gmail.com>
Co-authored-by: Laughing-q <1185102784@qq.com>
Co-authored-by: Alexander Duda <Alexander.Duda@me.com>
Co-authored-by: Mehran Ghandehari <mehran.maps@gmail.com>
Co-authored-by: Snyk bot <snyk-bot@snyk.io>
Co-authored-by: majid nasiri <majnasai@gmail.com>
Authored by Ayush Chaurasia on 2023-04-06 03:55:32 +05:30, committed by GitHub
parent 9af3e69b1a
commit 1cb92d7f42
57 changed files with 1578 additions and 489 deletions

View File

@ -1,6 +1,6 @@
# Ultralytics YOLO 🚀, GPL-3.0 license
__version__ = '8.0.65'
__version__ = '8.0.66'
from ultralytics.hub import start
from ultralytics.yolo.engine.model import YOLO

View File

@ -0,0 +1,38 @@
# Ultralytics YOLO 🚀, GPL-3.0 license
# COCO 2017 dataset http://cocodataset.org by Microsoft
# Example usage: yolo train data=coco-pose.yaml
# parent
# ├── ultralytics
# └── datasets
# └── coco-pose ← downloads here (20.1 GB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/coco-pose # dataset root dir
train: train2017.txt # train images (relative to 'path') 118287 images
val: val2017.txt # val images (relative to 'path') 5000 images
test: test-dev2017.txt # 20288 of 40670 images, submit to https://competitions.codalab.org/competitions/20794
# Keypoints
kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
flip_idx: [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
# Classes
names:
0: person
# Download script/URL (optional)
download: |
from ultralytics.yolo.utils.downloads import download
from pathlib import Path
# Download labels
dir = Path(yaml['path']) # dataset root dir
url = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/'
urls = [url + 'coco2017labels-pose.zip'] # labels
download(urls, dir=dir.parent)
# Download data
urls = ['http://images.cocodataset.org/zips/train2017.zip', # 19G, 118k images
'http://images.cocodataset.org/zips/val2017.zip', # 1G, 5k images
'http://images.cocodataset.org/zips/test2017.zip'] # 7G, 41k images (optional)
download(urls, dir=dir / 'images', threads=3)

View File

@ -0,0 +1,25 @@
# Ultralytics YOLO 🚀, GPL-3.0 license
# COCO8-pose dataset (first 8 images from COCO train2017) by Ultralytics
# Example usage: yolo train data=coco8-pose.yaml
# parent
# ├── ultralytics
# └── datasets
# └── coco8-pose ← downloads here (1 MB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/coco8-pose # dataset root dir
train: images/train # train images (relative to 'path') 4 images
val: images/val # val images (relative to 'path') 4 images
test: # test images (optional)
# Keypoints
kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
flip_idx: [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
# Classes
names:
0: person
# Download script/URL (optional)
download: https://ultralytics.com/assets/coco8-pose.zip
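A minimal sketch of consuming these dataset YAMLs through the Python API in this release; the epoch count and image size are arbitrary illustration values:

from ultralytics import YOLO

# build a pose model from the new config and smoke-test training on the 8-image dataset
model = YOLO('yolov8n-pose.yaml')  # or 'yolov8n-pose.pt' to start from released weights
model.train(data='coco8-pose.yaml', epochs=3, imgsz=640)  # kpt_shape and flip_idx are read from the dataset YAML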

View File

@ -44,13 +44,14 @@ Any of these models can be used by loading their configs or pretrained checkpoin
### 1. YOLOv8
**About** - Cutting edge Detection, Segmentation and Classification models developed by Ultralytics. </br>
**About** - Cutting edge Detection, Segmentation, Classification and Pose models developed by Ultralytics. </br>
Available Models:
- Detection - `yolov8n`, `yolov8s`, `yolov8m`, `yolov8l`, `yolov8x`
- Instance Segmentation - `yolov8n-seg`, `yolov8s-seg`, `yolov8m-seg`, `yolov8l-seg`, `yolov8x-seg`
- Classification - `yolov8n-cls`, `yolov8s-cls`, `yolov8m-cls`, `yolov8l-cls`, `yolov8x-cls`
- Pose - `yolov8n-pose`, `yolov8s-pose`, `yolov8m-pose`, `yolov8l-pose`, `yolov8x-pose`, `yolov8x-pose-p6`
<details><summary>Performance</summary>
@ -84,6 +85,17 @@ Available Models:
| [YOLOv8l-cls](https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8l-cls.pt) | 224 | 78.0 | 94.1 | 163.0 | 0.87 | 37.5 | 99.7 |
| [YOLOv8x-cls](https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8x-cls.pt) | 224 | 78.4 | 94.3 | 232.0 | 1.01 | 57.4 | 154.8 |
### Pose
| Model | size<br><sup>(pixels) | mAP<sup>box<br>50-95 | mAP<sup>pose<br>50-95 | Speed<br><sup>CPU ONNX<br>(ms) | Speed<br><sup>A100 TensorRT<br>(ms) | params<br><sup>(M) | FLOPs<br><sup>(B) |
| ---------------------------------------------------------------------------------------------------- | --------------------- | -------------------- | --------------------- | ------------------------------ | ----------------------------------- | ------------------ | ----------------- |
| [YOLOv8n-pose](https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n-pose.pt) | 640 | - | 49.7 | - | - | 3.3 | 9.2 |
| [YOLOv8s-pose](https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8s-pose.pt) | 640 | - | 59.2 | - | - | 11.6 | 30.2 |
| [YOLOv8m-pose](https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8m-pose.pt) | 640 | - | 63.6 | - | - | 26.4 | 81.0 |
| [YOLOv8l-pose](https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8l-pose.pt) | 640 | - | 67.0 | - | - | 44.4 | 168.6 |
| [YOLOv8x-pose](https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8x-pose.pt) | 640 | - | 68.9 | - | - | 69.4 | 263.2 |
| [YOLOv8x-pose-p6](https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8x-pose-p6.pt) | 1280 | - | 71.5 | - | - | 99.1 | 1066.4 |
</details>
### 2. YOLOv5u
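A minimal sketch of running one of the YOLOv8 pose checkpoints from the table above; the source image URL is only an illustrative example:

from ultralytics import YOLO

model = YOLO('yolov8n-pose.pt')
results = model('https://ultralytics.com/images/bus.jpg')  # predict on one image
for r in results:
    print(r.boxes, r.keypoints)  # per-person boxes plus the raw keypoint tensor (see the Results changes below)
im = results[0].plot(kpt_line=True)  # image array with the pose skeleton drawn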

View File

@ -0,0 +1,57 @@
# Ultralytics YOLO 🚀, GPL-3.0 license
# YOLOv8 object detection model with P3-P6 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect
# Parameters
nc: 1 # number of classes
kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
scales: # model compound scaling constants, i.e. 'model=yolov8n-p6.yaml' will call yolov8-p6.yaml with scale 'n'
# [depth, width, max_channels]
n: [0.33, 0.25, 1024]
s: [0.33, 0.50, 1024]
m: [0.67, 0.75, 768]
l: [1.00, 1.00, 512]
x: [1.00, 1.25, 512]
# YOLOv8.0x6 backbone
backbone:
# [from, repeats, module, args]
- [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
- [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
- [-1, 3, C2f, [128, True]]
- [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
- [-1, 6, C2f, [256, True]]
- [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
- [-1, 6, C2f, [512, True]]
- [-1, 1, Conv, [768, 3, 2]] # 7-P5/32
- [-1, 3, C2f, [768, True]]
- [-1, 1, Conv, [1024, 3, 2]] # 9-P6/64
- [-1, 3, C2f, [1024, True]]
- [-1, 1, SPPF, [1024, 5]] # 11
# YOLOv8.0x6 head
head:
- [-1, 1, nn.Upsample, [None, 2, 'nearest']]
- [[-1, 8], 1, Concat, [1]] # cat backbone P5
- [-1, 3, C2, [768, False]] # 14
- [-1, 1, nn.Upsample, [None, 2, 'nearest']]
- [[-1, 6], 1, Concat, [1]] # cat backbone P4
- [-1, 3, C2, [512, False]] # 17
- [-1, 1, nn.Upsample, [None, 2, 'nearest']]
- [[-1, 4], 1, Concat, [1]] # cat backbone P3
- [-1, 3, C2, [256, False]] # 20 (P3/8-small)
- [-1, 1, Conv, [256, 3, 2]]
- [[-1, 17], 1, Concat, [1]] # cat head P4
- [-1, 3, C2, [512, False]] # 23 (P4/16-medium)
- [-1, 1, Conv, [512, 3, 2]]
- [[-1, 14], 1, Concat, [1]] # cat head P5
- [-1, 3, C2, [768, False]] # 26 (P5/32-large)
- [-1, 1, Conv, [768, 3, 2]]
- [[-1, 11], 1, Concat, [1]] # cat head P6
- [-1, 3, C2, [1024, False]] # 29 (P6/64-xlarge)
- [[20, 23, 26, 29], 1, Pose, [nc, kpt_shape]] # Pose(P3, P4, P5, P6)

View File

@ -0,0 +1,47 @@
# Ultralytics YOLO 🚀, GPL-3.0 license
# YOLOv8-pose keypoints/pose estimation model. For Usage examples see https://docs.ultralytics.com/tasks/pose
# Parameters
nc: 1 # number of classes
kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
scales: # model compound scaling constants, i.e. 'model=yolov8n-pose.yaml' will call yolov8-pose.yaml with scale 'n'
# [depth, width, max_channels]
n: [0.33, 0.25, 1024]
s: [0.33, 0.50, 1024]
m: [0.67, 0.75, 768]
l: [1.00, 1.00, 512]
x: [1.00, 1.25, 512]
# YOLOv8.0n backbone
backbone:
# [from, repeats, module, args]
- [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
- [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
- [-1, 3, C2f, [128, True]]
- [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
- [-1, 6, C2f, [256, True]]
- [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
- [-1, 6, C2f, [512, True]]
- [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
- [-1, 3, C2f, [1024, True]]
- [-1, 1, SPPF, [1024, 5]] # 9
# YOLOv8.0n head
head:
- [-1, 1, nn.Upsample, [None, 2, 'nearest']]
- [[-1, 6], 1, Concat, [1]] # cat backbone P4
- [-1, 3, C2f, [512]] # 12
- [-1, 1, nn.Upsample, [None, 2, 'nearest']]
- [[-1, 4], 1, Concat, [1]] # cat backbone P3
- [-1, 3, C2f, [256]] # 15 (P3/8-small)
- [-1, 1, Conv, [256, 3, 2]]
- [[-1, 12], 1, Concat, [1]] # cat head P4
- [-1, 3, C2f, [512]] # 18 (P4/16-medium)
- [-1, 1, Conv, [512, 3, 2]]
- [[-1, 9], 1, Concat, [1]] # cat head P5
- [-1, 3, C2f, [1024]] # 21 (P5/32-large)
- [[15, 18, 21], 1, Pose, [nc, kpt_shape]] # Pose(P3, P4, P5)
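A rough sketch of building this config directly with the new PoseModel class from ultralytics.nn.tasks (added later in this commit); the data_kpt_shape value is illustrative:

from ultralytics.nn.tasks import PoseModel

# scale 'n' is inferred from the filename; a data_kpt_shape that differs from the YAML logs an override message
model = PoseModel(cfg='yolov8n-pose.yaml', ch=3, nc=1, data_kpt_shape=(17, 3), verbose=True)
print(model.stride)  # strides computed from a dummy forward pass during build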

View File

@ -91,8 +91,10 @@ class AutoBackend(nn.Module):
if nn_module:
model = weights.to(device)
model = model.fuse(verbose=verbose) if fuse else model
names = model.module.names if hasattr(model, 'module') else model.names # get class names
if hasattr(model, 'kpt_shape'):
kpt_shape = model.kpt_shape # pose-only
stride = max(int(model.stride.max()), 32) # model stride
names = model.module.names if hasattr(model, 'module') else model.names # get class names
model.half() if fp16 else model.float()
self.model = model # explicitly assign for to(), cpu(), cuda(), half()
pt = True
@ -102,6 +104,8 @@ class AutoBackend(nn.Module):
device=device,
inplace=True,
fuse=fuse)
if hasattr(model, 'kpt_shape'):
kpt_shape = model.kpt_shape # pose-only
stride = max(int(model.stride.max()), 32) # model stride
names = model.module.names if hasattr(model, 'module') else model.names # get class names
model.half() if fp16 else model.float()
@ -268,13 +272,14 @@ class AutoBackend(nn.Module):
for k, v in metadata.items():
if k in ('stride', 'batch'):
metadata[k] = int(v)
elif k in ('imgsz', 'names') and isinstance(v, str):
elif k in ('imgsz', 'names', 'kpt_shape') and isinstance(v, str):
metadata[k] = eval(v)
stride = metadata['stride']
task = metadata['task']
batch = metadata['batch']
imgsz = metadata['imgsz']
names = metadata['names']
kpt_shape = metadata.get('kpt_shape')
elif not (pt or triton or nn_module):
LOGGER.warning(f"WARNING ⚠️ Metadata not found for 'model={weights}'")

View File

@ -378,7 +378,9 @@ class Ensemble(nn.ModuleList):
return y, None # inference, train output
# heads
# Model heads below ----------------------------------------------------------------------------------------------------
class Detect(nn.Module):
# YOLOv8 Detect head for detection models
dynamic = False # force grid reconstruction
@ -394,7 +396,6 @@ class Detect(nn.Module):
self.reg_max = 16 # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
self.no = nc + self.reg_max * 4 # number of outputs per anchor
self.stride = torch.zeros(self.nl) # strides computed during build
c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], self.nc) # channels
self.cv2 = nn.ModuleList(
nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
@ -454,6 +455,36 @@ class Segment(Detect):
return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p))
class Pose(Detect):
# YOLOv8 Pose head for keypoints models
def __init__(self, nc=80, kpt_shape=(17, 3), ch=()):
super().__init__(nc, ch)
self.kpt_shape = kpt_shape # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
self.nk = kpt_shape[0] * kpt_shape[1] # number of keypoints total
self.detect = Detect.forward
c4 = max(ch[0] // 4, self.nk)
self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nk, 1)) for x in ch)
def forward(self, x):
bs = x[0].shape[0] # batch size
kpt = torch.cat([self.cv4[i](x[i]).view(bs, self.nk, -1) for i in range(self.nl)], -1) # (bs, 17*3, h*w)
x = self.detect(self, x)
if self.training:
return x, kpt
pred_kpt = self.kpts_decode(kpt)
return torch.cat([x, pred_kpt], 1) if self.export else (torch.cat([x[0], pred_kpt], 1), (x[1], kpt))
def kpts_decode(self, kpts):
ndim = self.kpt_shape[1]
y = kpts.clone()
if ndim == 3:
y[:, 2::3].sigmoid_() # inplace sigmoid
y[:, 0::ndim] = (y[:, 0::ndim] * 2.0 + (self.anchors[0] - 0.5)) * self.strides
y[:, 1::ndim] = (y[:, 1::ndim] * 2.0 + (self.anchors[1] - 0.5)) * self.strides
return y
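A rough worked example of the kpts_decode arithmetic above for ndim == 3, with illustrative raw values: in a grid cell whose anchor centre is 3.5 at stride 8, a raw x of 0.25 decodes to (0.25 * 2.0 + (3.5 - 0.5)) * 8 = 28.0 pixels, and a raw visibility logit of 0.8 passes through the in-place sigmoid to roughly 0.69.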
class Classify(nn.Module):
# YOLOv8 classification head, i.e. x(b,c1,20,20) to x(b,c2)
def __init__(self, c1, c2, k=1, s=1, p=None, g=1): # ch_in, ch_out, kernel, stride, padding, groups

View File

@ -10,7 +10,7 @@ import torch.nn as nn
from ultralytics.nn.modules import (C1, C2, C3, C3TR, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost, C3x, Classify,
Concat, Conv, ConvTranspose, Detect, DWConv, DWConvTranspose2d, Ensemble, Focus,
GhostBottleneck, GhostConv, Segment)
GhostBottleneck, GhostConv, Pose, Segment)
from ultralytics.yolo.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load
from ultralytics.yolo.utils.checks import check_requirements, check_suffix, check_yaml
from ultralytics.yolo.utils.torch_utils import (fuse_conv_and_bn, fuse_deconv_and_bn, initialize_weights,
@ -183,10 +183,10 @@ class DetectionModel(BaseModel):
# Build strides
m = self.model[-1] # Detect()
if isinstance(m, (Detect, Segment)):
if isinstance(m, (Detect, Segment, Pose)):
s = 256 # 2x min stride
m.inplace = self.inplace
forward = lambda x: self.forward(x)[0] if isinstance(m, Segment) else self.forward(x)
forward = lambda x: self.forward(x)[0] if isinstance(m, (Segment, Pose)) else self.forward(x)
m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s))]) # forward
self.stride = m.stride
m.bias_init() # only run once
@ -242,12 +242,23 @@ class DetectionModel(BaseModel):
class SegmentationModel(DetectionModel):
# YOLOv8 segmentation model
def __init__(self, cfg='yolov8n-seg.yaml', ch=3, nc=None, verbose=True):
super().__init__(cfg, ch, nc, verbose)
super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
def _forward_augment(self, x):
raise NotImplementedError(emojis('WARNING ⚠️ SegmentationModel has not supported augment inference yet!'))
class PoseModel(DetectionModel):
# YOLOv8 pose model
def __init__(self, cfg='yolov8n-pose.yaml', ch=3, nc=None, data_kpt_shape=(None, None), verbose=True):
if not isinstance(cfg, dict):
cfg = yaml_model_load(cfg) # load model YAML
if any(data_kpt_shape) and list(data_kpt_shape) != list(cfg['kpt_shape']):
LOGGER.info(f"Overriding model.yaml kpt_shape={cfg['kpt_shape']} with kpt_shape={data_kpt_shape}")
cfg['kpt_shape'] = data_kpt_shape
super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
class ClassificationModel(BaseModel):
# YOLOv8 classification model
def __init__(self,
@ -425,7 +436,7 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
# Args
max_channels = float('inf')
nc, act, scales = (d.get(x) for x in ('nc', 'act', 'scales'))
depth, width = (d.get(x, 1.0) for x in ('depth_multiple', 'width_multiple'))
depth, width, kpt_shape = (d.get(x, 1.0) for x in ('depth_multiple', 'width_multiple', 'kpt_shape'))
if scales:
scale = d.get('scale')
if not scale:
@ -464,7 +475,7 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
args = [ch[f]]
elif m is Concat:
c2 = sum(ch[x] for x in f)
elif m in (Detect, Segment):
elif m in (Detect, Segment, Pose):
args.append([ch[x] for x in f])
if m is Segment:
args[2] = make_divisible(min(args[2], max_channels) * width, 8)
@ -543,6 +554,8 @@ def guess_model_task(model):
return 'detect'
if m == 'segment':
return 'segment'
if m == 'pose':
return 'pose'
# Guess from model cfg
if isinstance(model, dict):
@ -565,6 +578,8 @@ def guess_model_task(model):
return 'segment'
elif isinstance(m, Classify):
return 'classify'
elif isinstance(m, Pose):
return 'pose'
# Guess from model filename
if isinstance(model, (str, Path)):
@ -573,10 +588,12 @@ def guess_model_task(model):
return 'segment'
elif '-cls' in model.stem or 'classify' in model.parts:
return 'classify'
elif '-pose' in model.stem or 'pose' in model.parts:
return 'pose'
elif 'detect' in model.parts:
return 'detect'
# Unable to determine task from model
LOGGER.warning("WARNING ⚠️ Unable to automatically guess model task, assuming 'task=detect'. "
"Explicitly define task for your model, i.e. 'task=detect', 'task=segment' or 'task=classify'.")
"Explicitly define task for your model, i.e. 'task=detect', 'segment', 'classify', or 'pose'.")
return 'detect' # assume detect
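A quick sketch of the filename fallback above; the paths are illustrative:

from ultralytics.nn.tasks import guess_model_task

print(guess_model_task('yolov8n-pose.yaml'))  # 'pose', matched via the new '-pose' stem branch
print(guess_model_task('yolov8n-seg.pt'))     # 'segment'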

View File

@ -33,10 +33,9 @@ def on_predict_postprocess_end(predictor):
tracks = predictor.trackers[i].update(det, im0s[i])
if len(tracks) == 0:
continue
idx = tracks[:, -1].tolist()
predictor.results[i] = predictor.results[i][idx]
predictor.results[i].update(boxes=torch.as_tensor(tracks[:, :-1]))
if predictor.results[i].masks is not None:
idx = tracks[:, -1].tolist()
predictor.results[i].masks = predictor.results[i].masks[idx]
def register_tracker(model):

View File

@ -18,13 +18,13 @@ TASKS = 'detect', 'segment', 'classify', 'pose'
TASK2DATA = {
'detect': 'coco128.yaml',
'segment': 'coco128-seg.yaml',
'pose': 'coco128-pose.yaml',
'classify': 'imagenet100'}
'classify': 'imagenet100',
'pose': 'coco128-pose.yaml'}
TASK2MODEL = {
'detect': 'yolov8n.pt',
'segment': 'yolov8n-seg.pt',
'pose': 'yolov8n-pose.yaml',
'classify': 'yolov8n-cls.pt'} # temp
'classify': 'yolov8n-cls.pt',
'pose': 'yolov8n-pose.yaml'}
CLI_HELP_MSG = \
f"""

View File

@ -88,6 +88,8 @@ warmup_bias_lr: 0.1 # warmup initial bias lr
box: 7.5 # box loss gain
cls: 0.5 # cls loss gain (scale with pixels)
dfl: 1.5 # dfl loss gain
pose: 12.0 # pose loss gain
kobj: 1.0 # keypoint obj loss gain
fl_gamma: 0.0 # focal loss gamma (efficientDet default gamma=1.5)
label_smoothing: 0.0 # label smoothing (fraction)
nbs: 64 # nominal batch size

View File

@ -16,6 +16,8 @@ from ..utils.metrics import bbox_ioa
from ..utils.ops import segment2box
from .utils import polygons2masks, polygons2masks_overlap
POSE_FLIPLR_INDEX = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
# TODO: we might need a BaseTransform to make all these augments be compatible with both classification and semantic
class BaseTransform:
@ -309,27 +311,22 @@ class RandomPerspective:
"""apply affine to keypoints.
Args:
keypoints(ndarray): keypoints, [N, 17, 2].
keypoints(ndarray): keypoints, [N, 17, 3].
M(ndarray): affine matrix.
Return:
new_keypoints(ndarray): keypoints after affine, [N, 17, 2].
new_keypoints(ndarray): keypoints after affine, [N, 17, 3].
"""
n = len(keypoints)
n, nkpt = keypoints.shape[:2]
if n == 0:
return keypoints
new_keypoints = np.ones((n * 17, 3))
new_keypoints[:, :2] = keypoints.reshape(n * 17, 2) # num_kpt is hardcoded to 17
new_keypoints = new_keypoints @ M.T # transform
new_keypoints = (new_keypoints[:, :2] / new_keypoints[:, 2:3]).reshape(n, 34) # perspective rescale or affine
new_keypoints[keypoints.reshape(-1, 34) == 0] = 0
x_kpts = new_keypoints[:, list(range(0, 34, 2))]
y_kpts = new_keypoints[:, list(range(1, 34, 2))]
x_kpts[np.logical_or.reduce((x_kpts < 0, x_kpts > self.size[0], y_kpts < 0, y_kpts > self.size[1]))] = 0
y_kpts[np.logical_or.reduce((x_kpts < 0, x_kpts > self.size[0], y_kpts < 0, y_kpts > self.size[1]))] = 0
new_keypoints[:, list(range(0, 34, 2))] = x_kpts
new_keypoints[:, list(range(1, 34, 2))] = y_kpts
return new_keypoints.reshape(n, 17, 2)
xy = np.ones((n * nkpt, 3))
visible = keypoints[..., 2].reshape(n * nkpt, 1)
xy[:, :2] = keypoints[..., :2].reshape(n * nkpt, 2)
xy = xy @ M.T # transform
xy = xy[:, :2] / xy[:, 2:3] # perspective rescale or affine
out_mask = (xy[:, 0] < 0) | (xy[:, 1] < 0) | (xy[:, 0] > self.size[0]) | (xy[:, 1] > self.size[1])
visible[out_mask] = 0
return np.concatenate([xy, visible], axis=-1).reshape(n, nkpt, 3)
def __call__(self, labels):
"""
@ -415,12 +412,13 @@ class RandomHSV:
class RandomFlip:
def __init__(self, p=0.5, direction='horizontal') -> None:
def __init__(self, p=0.5, direction='horizontal', flip_idx=None) -> None:
assert direction in ['horizontal', 'vertical'], f'Support direction `horizontal` or `vertical`, got {direction}'
assert 0 <= p <= 1.0
self.p = p
self.direction = direction
self.flip_idx = flip_idx
def __call__(self, labels):
img = labels['img']
@ -437,6 +435,9 @@ class RandomFlip:
if self.direction == 'horizontal' and random.random() < self.p:
img = np.fliplr(img)
instances.fliplr(w)
# for keypoints
if self.flip_idx is not None and instances.keypoints is not None:
instances.keypoints = np.ascontiguousarray(instances.keypoints[:, self.flip_idx, :])
labels['img'] = np.ascontiguousarray(img)
labels['instances'] = instances
return labels
@ -633,7 +634,7 @@ class Format:
labels['cls'] = torch.from_numpy(cls) if nl else torch.zeros(nl)
labels['bboxes'] = torch.from_numpy(instances.bboxes) if nl else torch.zeros((nl, 4))
if self.return_keypoint:
labels['keypoints'] = torch.from_numpy(instances.keypoints) if nl else torch.zeros((nl, 17, 2))
labels['keypoints'] = torch.from_numpy(instances.keypoints)
# then we can use collate_fn
if self.batch_idx:
labels['batch_idx'] = torch.zeros(nl)
@ -672,13 +673,17 @@ def v8_transforms(dataset, imgsz, hyp):
perspective=hyp.perspective,
pre_transform=LetterBox(new_shape=(imgsz, imgsz)),
)])
flip_idx = dataset.data.get('flip_idx', None) # for keypoints augmentation
if dataset.use_keypoints and flip_idx is None and hyp.fliplr > 0.0:
hyp.fliplr = 0.0
LOGGER.warning("WARNING ⚠️ No `flip_idx` provided while training keypoints, setting augmentation 'fliplr=0.0'")
return Compose([
pre_transform,
MixUp(dataset, pre_transform=pre_transform, p=hyp.mixup),
Albumentations(p=1.0),
RandomHSV(hgain=hyp.hsv_h, sgain=hyp.hsv_s, vgain=hyp.hsv_v),
RandomFlip(direction='vertical', p=hyp.flipud),
RandomFlip(direction='horizontal', p=hyp.fliplr)]) # transforms
RandomFlip(direction='horizontal', p=hyp.fliplr, flip_idx=flip_idx)]) # transforms
# Classification augmentations -----------------------------------------------------------------------------------------
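A small sketch of what the new flip_idx handling does to a keypoint array during a horizontal flip; the array values are synthetic:

import numpy as np

POSE_FLIPLR_INDEX = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
kpts = np.arange(17 * 3, dtype=np.float32).reshape(1, 17, 3)  # one instance in COCO keypoint order
flipped = np.ascontiguousarray(kpts[:, POSE_FLIPLR_INDEX, :])  # swap left/right joints to match np.fliplr(img)
print(flipped[0, 1, 0] == kpts[0, 2, 0])  # True: the left-eye slot now holds the old right-eye values
# Instances.fliplr(w) separately mirrors the x coordinates; flip_idx only reorders the joints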

View File

@ -61,7 +61,7 @@ def seed_worker(worker_id): # noqa
random.seed(worker_seed)
def build_dataloader(cfg, batch, img_path, stride=32, rect=False, names=None, rank=-1, mode='train'):
def build_dataloader(cfg, batch, img_path, data_info, stride=32, rect=False, rank=-1, mode='train'):
assert mode in ['train', 'val']
shuffle = mode == 'train'
if cfg.rect and shuffle:
@ -81,9 +81,9 @@ def build_dataloader(cfg, batch, img_path, stride=32, rect=False, names=None, ra
pad=0.0 if mode == 'train' else 0.5,
prefix=colorstr(f'{mode}: '),
use_segments=cfg.task == 'segment',
use_keypoints=cfg.task == 'keypoint',
names=names,
classes=cfg.classes)
use_keypoints=cfg.task == 'pose',
classes=cfg.classes,
data=data_info)
batch = min(batch, len(dataset))
nd = torch.cuda.device_count() # number of CUDA devices

View File

@ -57,11 +57,11 @@ class YOLODataset(BaseDataset):
single_cls=False,
use_segments=False,
use_keypoints=False,
names=None,
data=None,
classes=None):
self.use_segments = use_segments
self.use_keypoints = use_keypoints
self.names = names
self.data = data
assert not (self.use_segments and self.use_keypoints), 'Can not use both segments and keypoints.'
super().__init__(img_path, imgsz, cache, augment, hyp, prefix, rect, batch_size, stride, pad, single_cls,
classes)
@ -77,10 +77,16 @@ class YOLODataset(BaseDataset):
nm, nf, ne, nc, msgs = 0, 0, 0, 0, [] # number missing, found, empty, corrupt, messages
desc = f'{self.prefix}Scanning {path.parent / path.stem}...'
total = len(self.im_files)
nc = len(self.data['names'])
nkpt, ndim = self.data.get('kpt_shape', (0, 0))
if self.use_keypoints and (nkpt <= 0 or ndim not in (2, 3)):
raise ValueError("'kpt_shape' in data.yaml missing or incorrect. Should be a list with [number of "
"keypoints, number of dims (2 for x,y or 3 for x,y,visible)], i.e. 'kpt_shape: [17, 3]'")
with ThreadPool(NUM_THREADS) as pool:
results = pool.imap(func=verify_image_label,
iterable=zip(self.im_files, self.label_files, repeat(self.prefix),
repeat(self.use_keypoints), repeat(len(self.names))))
repeat(self.use_keypoints), repeat(len(self.data['names'])), repeat(nkpt),
repeat(ndim)))
pbar = tqdm(results, desc=desc, total=total, bar_format=TQDM_BAR_FORMAT)
for im_file, lb, shape, segments, keypoint, nm_f, nf_f, ne_f, nc_f, msg in pbar:
nm += nm_f

View File

@ -6,10 +6,10 @@ import json
import os
import subprocess
import time
import zipfile
from multiprocessing.pool import ThreadPool
from pathlib import Path
from tarfile import is_tarfile
from zipfile import is_zipfile
import cv2
import numpy as np
@ -61,7 +61,7 @@ def exif_size(img):
def verify_image_label(args):
# Verify one image-label pair
im_file, lb_file, prefix, keypoint, num_cls = args
im_file, lb_file, prefix, keypoint, num_cls, nkpt, ndim = args
# number (missing, found, empty, corrupt), message, segments, keypoints
nm, nf, ne, nc, msg, segments, keypoints = 0, 0, 0, 0, '', [], None
try:
@ -92,25 +92,19 @@ def verify_image_label(args):
nl = len(lb)
if nl:
if keypoint:
assert lb.shape[1] == 56, 'labels require 56 columns each'
assert (lb[:, 5::3] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
assert (lb[:, 6::3] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
kpts = np.zeros((lb.shape[0], 39))
for i in range(len(lb)):
kpt = np.delete(lb[i, 5:], np.arange(2, lb.shape[1] - 5, 3)) # remove occlusion param from GT
kpts[i] = np.hstack((lb[i, :5], kpt))
lb = kpts
assert lb.shape[1] == 39, 'labels require 39 columns each after removing occlusion parameter'
assert lb.shape[1] == (5 + nkpt * ndim), f'labels require {(5 + nkpt * ndim)} columns each'
assert (lb[:, 5::ndim] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
assert (lb[:, 6::ndim] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
else:
assert lb.shape[1] == 5, f'labels require 5 columns, {lb.shape[1]} columns detected'
assert (lb[:, 1:] <= 1).all(), \
f'non-normalized or out of bounds coordinates {lb[:, 1:][lb[:, 1:] > 1]}'
assert (lb >= 0).all(), f'negative label values {lb[lb < 0]}'
# All labels
max_cls = int(lb[:, 0].max()) # max label count
assert max_cls <= num_cls, \
f'Label class {max_cls} exceeds dataset class count {num_cls}. ' \
f'Possible class labels are 0-{num_cls - 1}'
assert (lb >= 0).all(), f'negative label values {lb[lb < 0]}'
_, i = np.unique(lb, axis=0, return_index=True)
if len(i) < nl: # duplicate row check
lb = lb[i] # remove duplicates
@ -119,12 +113,18 @@ def verify_image_label(args):
msg = f'{prefix}WARNING ⚠️ {im_file}: {nl - len(i)} duplicate labels removed'
else:
ne = 1 # label empty
lb = np.zeros((0, 39), dtype=np.float32) if keypoint else np.zeros((0, 5), dtype=np.float32)
lb = np.zeros((0, (5 + nkpt * ndim)), dtype=np.float32) if keypoint else np.zeros(
(0, 5), dtype=np.float32)
else:
nm = 1 # label missing
lb = np.zeros((0, 39), dtype=np.float32) if keypoint else np.zeros((0, 5), dtype=np.float32)
lb = np.zeros((0, (5 + nkpt * ndim)), dtype=np.float32) if keypoint else np.zeros((0, 5), dtype=np.float32)
if keypoint:
keypoints = lb[:, 5:].reshape(-1, 17, 2)
keypoints = lb[:, 5:].reshape(-1, nkpt, ndim)
if ndim == 2:
kpt_mask = np.ones(keypoints.shape[:2], dtype=np.float32)
kpt_mask = np.where(keypoints[..., 0] < 0, 0.0, kpt_mask)
kpt_mask = np.where(keypoints[..., 1] < 0, 0.0, kpt_mask)
keypoints = np.concatenate([keypoints, kpt_mask[..., None]], axis=-1) # (nl, nkpt, 3)
lb = lb[:, :5]
return im_file, lb, shape, segments, keypoints, nm, nf, ne, nc, msg
except Exception as e:
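The column check above generalises the old hard-coded 56/39 values; a quick sanity sketch assuming the COCO kpt_shape of [17, 3]:

# 5 columns (class, x, y, w, h) plus nkpt * ndim keypoint columns per label row
nkpt, ndim = 17, 3
print(5 + nkpt * ndim)  # 56 for x,y,visible labels; 5 + 17 * 2 = 39 when visibility is omitted (ndim=2)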
@ -195,7 +195,7 @@ def check_det_dataset(dataset, autodownload=True):
# Download (optional)
extract_dir = ''
if isinstance(data, (str, Path)) and (is_zipfile(data) or is_tarfile(data)):
if isinstance(data, (str, Path)) and (zipfile.is_zipfile(data) or is_tarfile(data)):
new_dir = safe_download(data, dir=DATASETS_DIR, unzip=True, delete=False, curl=False)
data = next((DATASETS_DIR / new_dir).rglob('*.yaml'))
extract_dir, autodownload = data.parent, False
@ -356,23 +356,8 @@ class HUBDatasetStats():
assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
return True, str(dir), self._find_yaml(dir) # zipped, data_dir, yaml_path
def _hub_ops(self, f, max_dim=1920):
# HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
f_new = self.im_dir / Path(f).name # dataset-hub image filename
try: # use PIL
im = Image.open(f)
r = max_dim / max(im.height, im.width) # ratio
if r < 1.0: # image too large
im = im.resize((int(im.width * r), int(im.height * r)))
im.save(f_new, 'JPEG', quality=50, optimize=True) # save
except Exception as e: # use OpenCV
LOGGER.info(f'WARNING ⚠️ HUB ops PIL failure {f}: {e}')
im = cv2.imread(f)
im_height, im_width = im.shape[:2]
r = max_dim / max(im_height, im_width) # ratio
if r < 1.0: # image too large
im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
cv2.imwrite(str(f_new), im)
def _hub_ops(self, f):
compress_one_image(f, self.im_dir / Path(f).name) # save to dataset-hub
def get_json(self, save=False, verbose=False):
# Return dataset JSON for Ultralytics HUB
@ -426,3 +411,93 @@ class HUBDatasetStats():
pass
LOGGER.info(f'Done. All images saved to {self.im_dir}')
return self.im_dir
def compress_one_image(f, f_new=None, max_dim=1920, quality=50):
"""
Compresses a single image file to a reduced size while preserving its aspect ratio, using either the Python
Imaging Library (PIL) or OpenCV. If the input image is already smaller than the maximum dimension, it is only
re-encoded at the given quality and not resized.
Args:
f (str): The path to the input image file.
f_new (str, optional): The path to the output image file. If not specified, the input file will be overwritten.
max_dim (int, optional): The maximum dimension (width or height) of the output image. Default is 1920 pixels.
quality (int, optional): The image compression quality as a percentage. Default is 50%.
Returns:
None
Usage:
from pathlib import Path
from ultralytics.yolo.data.utils import compress_one_image
for f in Path('/Users/glennjocher/Downloads/dataset').rglob('*.jpg'):
compress_one_image(f)
"""
try: # use PIL
im = Image.open(f)
r = max_dim / max(im.height, im.width) # ratio
if r < 1.0: # image too large
im = im.resize((int(im.width * r), int(im.height * r)))
im.save(f_new or f, 'JPEG', quality=quality, optimize=True) # save
except Exception as e: # use OpenCV
LOGGER.info(f'WARNING ⚠️ HUB ops PIL failure {f}: {e}')
im = cv2.imread(f)
im_height, im_width = im.shape[:2]
r = max_dim / max(im_height, im_width) # ratio
if r < 1.0: # image too large
im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
cv2.imwrite(str(f_new or f), im)
def delete_dsstore(path):
"""
Deletes all ".DS_store" files under a specified directory.
Args:
path (str, optional): The directory path where the ".DS_store" files should be deleted.
Returns:
None
Usage:
from ultralytics.yolo.data.utils import delete_dsstore
delete_dsstore('/Users/glennjocher/Downloads/dataset')
Note:
".DS_store" files are created by the Apple operating system and contain metadata about folders and files. They
are hidden system files and can cause issues when transferring files between different operating systems.
"""
# Delete Apple .DS_store files
files = list(Path(path).rglob('.DS_store'))
LOGGER.info(f'Deleting *.DS_store files: {files}')
for f in files:
f.unlink()
def zip_directory(dir, use_zipfile_library=True):
"""Zips a directory and saves the archive to the specified output path.
Args:
dir (str): The path to the directory to be zipped.
use_zipfile_library (bool): Whether to use zipfile library or shutil for zipping.
Returns:
None
Usage:
from ultralytics.yolo.data.utils import zip_directory
zip_directory('/Users/glennjocher/Downloads/playground')
zip -r coco8-pose.zip coco8-pose
"""
delete_dsstore(dir)
if use_zipfile_library:
dir = Path(dir)
with zipfile.ZipFile(dir.with_suffix('.zip'), 'w', zipfile.ZIP_DEFLATED) as zip_file:
for file_path in dir.glob('**/*'):
if file_path.is_file():
zip_file.write(file_path, file_path.relative_to(dir))
else:
import shutil
shutil.make_archive(dir, 'zip', dir)
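A rough sketch of chaining the new helpers to package a small dataset such as coco8-pose; the local path is hypothetical:

from pathlib import Path
from ultralytics.yolo.data.utils import compress_one_image, zip_directory

root = Path('datasets/coco8-pose')  # hypothetical local dataset directory
for f in root.rglob('*.jpg'):
    compress_one_image(f)  # downscale anything over 1920 px and re-encode at quality=50
zip_directory(root)  # deletes .DS_store files first, then writes datasets/coco8-pose.zip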

View File

@ -209,8 +209,8 @@ class Exporter:
self.file = file
self.output_shape = tuple(y.shape) if isinstance(y, torch.Tensor) else tuple(tuple(x.shape) for x in y)
self.pretty_name = Path(self.model.yaml.get('yaml_file', self.file)).stem.replace('yolo', 'YOLO')
description = f'Ultralytics {self.pretty_name} model ' + f'trained on {Path(self.args.data).name}' \
if self.args.data else '(untrained)'
trained_on = f'trained on {Path(self.args.data).name}' if self.args.data else '(untrained)'
description = f'Ultralytics {self.pretty_name} model {trained_on}'
self.metadata = {
'description': description,
'author': 'Ultralytics',
@ -221,6 +221,8 @@ class Exporter:
'batch': self.args.batch,
'imgsz': self.imgsz,
'names': model.names} # model metadata
if model.task == 'pose':
self.metadata['kpt_shape'] = model.kpt_shape
LOGGER.info(f"\n{colorstr('PyTorch:')} starting from {file} with input shape {tuple(im.shape)} BCHW and "
f'output shape(s) {self.output_shape} ({file_size(file):.1f} MB)')
@ -295,7 +297,8 @@ class Exporter:
check_requirements(requirements)
import onnx # noqa
LOGGER.info(f'\n{prefix} starting export with onnx {onnx.__version__}...')
opset_version = self.args.opset or get_latest_opset()
LOGGER.info(f'\n{prefix} starting export with onnx {onnx.__version__} opset {opset_version}...')
f = str(self.file.with_suffix('.onnx'))
output_names = ['output0', 'output1'] if isinstance(self.model, SegmentationModel) else ['output0']
@ -313,7 +316,7 @@ class Exporter:
self.im.cpu() if dynamic else self.im,
f,
verbose=False,
opset_version=self.args.opset or get_latest_opset(),
opset_version=opset_version,
do_constant_folding=True, # WARNING: DNN inference with torch>=1.12 may require do_constant_folding=False
input_names=['images'],
output_names=output_names,
@ -377,7 +380,6 @@ class Exporter:
yaml_save(Path(f) / 'metadata.yaml', self.metadata) # add metadata.yaml
return f, None
@try_export
def _export_coreml(self, prefix=colorstr('CoreML:')):
# YOLOv8 CoreML export
check_requirements('coremltools>=6.0')
@ -410,8 +412,8 @@ class Exporter:
model = self.model
elif self.model.task == 'detect':
model = iOSDetectModel(self.model, self.im) if self.args.nms else self.model
elif self.model.task == 'segment':
# TODO CoreML Segmentation model pipelining
else:
# TODO CoreML Segment and Pose model pipelining
model = self.model
ts = torch.jit.trace(model.eval(), self.im, strict=False) # TorchScript model
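A minimal sketch of exporting a pose model so the new kpt_shape metadata is written; the format choice is illustrative:

from ultralytics import YOLO

model = YOLO('yolov8n-pose.pt')
model.export(format='onnx')  # the exported metadata now carries kpt_shape, which AutoBackend reads back at load time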

View File

@ -5,8 +5,8 @@ from pathlib import Path
from typing import Union
from ultralytics import yolo # noqa
from ultralytics.nn.tasks import (ClassificationModel, DetectionModel, SegmentationModel, attempt_load_one_weight,
guess_model_task, nn, yaml_model_load)
from ultralytics.nn.tasks import (ClassificationModel, DetectionModel, PoseModel, SegmentationModel,
attempt_load_one_weight, guess_model_task, nn, yaml_model_load)
from ultralytics.yolo.cfg import get_cfg
from ultralytics.yolo.engine.exporter import Exporter
from ultralytics.yolo.utils import (DEFAULT_CFG, DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, RANK, ROOT, callbacks,
@ -25,7 +25,8 @@ TASK_MAP = {
yolo.v8.detect.DetectionPredictor],
'segment': [
SegmentationModel, yolo.v8.segment.SegmentationTrainer, yolo.v8.segment.SegmentationValidator,
yolo.v8.segment.SegmentationPredictor]}
yolo.v8.segment.SegmentationPredictor],
'pose': [PoseModel, yolo.v8.pose.PoseTrainer, yolo.v8.pose.PoseValidator, yolo.v8.pose.PosePredictor]}
class YOLO:
@ -195,7 +196,7 @@ class YOLO:
self.model.load(weights)
return self
def info(self, verbose=False):
def info(self, verbose=True):
"""
Logs model info.

View File

@ -246,6 +246,7 @@ class BasePredictor:
dnn=self.args.dnn,
data=self.args.data,
fp16=self.args.half,
fuse=True,
verbose=verbose)
self.device = device
self.model.eval()

View File

@ -17,6 +17,53 @@ from ultralytics.yolo.utils.plotting import Annotator, colors
from ultralytics.yolo.utils.torch_utils import TORCHVISION_0_10
class BaseTensor(SimpleClass):
"""
Attributes:
tensor (torch.Tensor): A tensor.
orig_shape (tuple): Original image size, in the format (height, width).
Methods:
cpu(): Returns a copy of the tensor on CPU memory.
numpy(): Returns a copy of the tensor as a numpy array.
cuda(): Returns a copy of the tensor on GPU memory.
to(): Returns a copy of the tensor with the specified device and dtype.
"""
def __init__(self, tensor, orig_shape) -> None:
super().__init__()
assert isinstance(tensor, torch.Tensor)
self.tensor = tensor
self.orig_shape = orig_shape
@property
def shape(self):
return self.data.shape
@property
def data(self):
return self.tensor
def cpu(self):
return self.__class__(self.data.cpu(), self.orig_shape)
def numpy(self):
return self.__class__(self.data.numpy(), self.orig_shape)
def cuda(self):
return self.__class__(self.data.cuda(), self.orig_shape)
def to(self, *args, **kwargs):
return self.__class__(self.data.to(*args, **kwargs), self.orig_shape)
def __len__(self): # override len(results)
return len(self.data)
def __getitem__(self, idx):
return self.__class__(self.data[idx], self.orig_shape)
class Results(SimpleClass):
"""
A class for storing and manipulating inference results.
@ -40,22 +87,23 @@ class Results(SimpleClass):
_keys (tuple): A tuple of attribute names for non-empty attributes.
"""
def __init__(self, orig_img, path, names, boxes=None, masks=None, probs=None) -> None:
def __init__(self, orig_img, path, names, boxes=None, masks=None, probs=None, keypoints=None) -> None:
self.orig_img = orig_img
self.orig_shape = orig_img.shape[:2]
self.boxes = Boxes(boxes, self.orig_shape) if boxes is not None else None # native size boxes
self.masks = Masks(masks, self.orig_shape) if masks is not None else None # native size or imgsz masks
self.probs = probs if probs is not None else None
self.keypoints = keypoints if keypoints is not None else None
self.names = names
self.path = path
self._keys = ('boxes', 'masks', 'probs')
self._keys = ('boxes', 'masks', 'probs', 'keypoints')
def pandas(self):
pass
# TODO masks.pandas + boxes.pandas + cls.pandas
def __getitem__(self, idx):
r = Results(orig_img=self.orig_img, path=self.path, names=self.names)
r = self.new()
for k in self.keys:
setattr(r, k, getattr(self, k)[idx])
return r
@ -69,25 +117,25 @@ class Results(SimpleClass):
self.probs = probs
def cpu(self):
r = Results(orig_img=self.orig_img, path=self.path, names=self.names)
r = self.new()
for k in self.keys:
setattr(r, k, getattr(self, k).cpu())
return r
def numpy(self):
r = Results(orig_img=self.orig_img, path=self.path, names=self.names)
r = self.new()
for k in self.keys:
setattr(r, k, getattr(self, k).numpy())
return r
def cuda(self):
r = Results(orig_img=self.orig_img, path=self.path, names=self.names)
r = self.new()
for k in self.keys:
setattr(r, k, getattr(self, k).cuda())
return r
def to(self, *args, **kwargs):
r = Results(orig_img=self.orig_img, path=self.path, names=self.names)
r = self.new()
for k in self.keys:
setattr(r, k, getattr(self, k).to(*args, **kwargs))
return r
@ -96,6 +144,9 @@ class Results(SimpleClass):
for k in self.keys:
return len(getattr(self, k))
def new(self):
return Results(orig_img=self.orig_img, path=self.path, names=self.names)
@property
def keys(self):
return [k for k in self._keys if getattr(self, k) is not None]
@ -109,6 +160,7 @@ class Results(SimpleClass):
pil=False,
example='abc',
img=None,
kpt_line=True,
labels=True,
boxes=True,
masks=True,
@ -126,6 +178,7 @@ class Results(SimpleClass):
pil (bool): Whether to return the image as a PIL Image.
example (str): An example string to display. Useful for indicating the expected format of the output.
img (numpy.ndarray): Plot to another image. if not, plot to original image.
kpt_line (bool): Whether to draw lines connecting keypoints.
labels (bool): Whether to plot the label of bounding boxes.
boxes (bool): Whether to plot the bounding boxes.
masks (bool): Whether to plot the masks.
@ -146,11 +199,12 @@ class Results(SimpleClass):
pred_masks, show_masks = self.masks, masks
pred_probs, show_probs = self.probs, probs
names = self.names
keypoints = self.keypoints
if pred_boxes and show_boxes:
for d in reversed(pred_boxes):
c, conf, id = int(d.cls), float(d.conf) if conf else None, None if d.id is None else int(d.id.item())
name = ('' if id is None else f'id:{id} ') + names[c]
label = (name if not conf else f'{name} {conf:.2f}') if labels else None
label = (f'{name} {conf:.2f}' if conf else name) if labels else None
annotator.box_label(d.xyxy.squeeze(), label, color=colors(c, True))
if pred_masks and show_masks:
@ -168,10 +222,14 @@ class Results(SimpleClass):
text = f"{', '.join(f'{names[j] if names else j} {pred_probs[j]:.2f}' for j in top5i)}, "
annotator.text((32, 32), text, txt_color=(255, 255, 255)) # TODO: allow setting colors
if keypoints is not None:
for k in reversed(keypoints):
annotator.kpts(k, self.orig_shape, kpt_line=kpt_line)
return np.asarray(annotator.im) if annotator.pil else annotator.im
class Boxes(SimpleClass):
class Boxes(BaseTensor):
"""
A class for storing and manipulating detection boxes.
@ -246,37 +304,15 @@ class Boxes(SimpleClass):
def xywhn(self):
return self.xywh / self.orig_shape[[1, 0, 1, 0]]
def cpu(self):
return Boxes(self.boxes.cpu(), self.orig_shape)
def numpy(self):
return Boxes(self.boxes.numpy(), self.orig_shape)
def cuda(self):
return Boxes(self.boxes.cuda(), self.orig_shape)
def to(self, *args, **kwargs):
return Boxes(self.boxes.to(*args, **kwargs), self.orig_shape)
def pandas(self):
LOGGER.info('results.pandas() method not yet implemented')
@property
def shape(self):
return self.boxes.shape
@property
def data(self):
return self.boxes
def __len__(self): # override len(results)
return len(self.boxes)
def __getitem__(self, idx):
return Boxes(self.boxes[idx], self.orig_shape)
class Masks(SimpleClass):
class Masks(BaseTensor):
"""
A class for storing and manipulating detection masks.
@ -316,7 +352,7 @@ class Masks(SimpleClass):
def xyn(self):
# Segments (normalized)
return [
ops.scale_segments(self.masks.shape[1:], x, self.orig_shape, normalize=True)
ops.scale_coords(self.masks.shape[1:], x, self.orig_shape, normalize=True)
for x in ops.masks2segments(self.masks)]
@property
@ -324,31 +360,9 @@ class Masks(SimpleClass):
def xy(self):
# Segments (pixels)
return [
ops.scale_segments(self.masks.shape[1:], x, self.orig_shape, normalize=False)
ops.scale_coords(self.masks.shape[1:], x, self.orig_shape, normalize=False)
for x in ops.masks2segments(self.masks)]
@property
def shape(self):
return self.masks.shape
@property
def data(self):
return self.masks
def cpu(self):
return Masks(self.masks.cpu(), self.orig_shape)
def numpy(self):
return Masks(self.masks.numpy(), self.orig_shape)
def cuda(self):
return Masks(self.masks.cuda(), self.orig_shape)
def to(self, *args, **kwargs):
return Masks(self.masks.to(*args, **kwargs), self.orig_shape)
def __len__(self): # override len(results)
return len(self.masks)
def __getitem__(self, idx):
return Masks(self.masks[idx], self.orig_shape)
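A short sketch of using the refactored Results downstream, assuming `results` came from a pose prediction like the earlier sketch:

r = results[0].cpu()  # Results and the BaseTensor-backed Boxes/Masks chain cpu()/numpy()/cuda()/to() together
print(r.keys)         # populated attributes, now including 'keypoints'
person0 = r[0]        # __getitem__ slices every populated attribute at once, keypoints included
print(person0.keypoints)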

View File

@ -75,11 +75,13 @@ def benchmark(model=Path(SETTINGS['weights_dir']) / 'yolov8n.pt', imgsz=160, hal
# Validate
if model.task == 'detect':
data, key = 'coco128.yaml', 'metrics/mAP50-95(B)'
data, key = 'coco8.yaml', 'metrics/mAP50-95(B)'
elif model.task == 'segment':
data, key = 'coco128-seg.yaml', 'metrics/mAP50-95(M)'
data, key = 'coco8-seg.yaml', 'metrics/mAP50-95(M)'
elif model.task == 'classify':
data, key = 'imagenet100', 'metrics/accuracy_top5'
elif model.task == 'pose':
data, key = 'coco8-pose.yaml', 'metrics/mAP50-95(P)'
results = export.val(data=data, batch=1, imgsz=imgsz, plots=False, device=device, half=half, verbose=False)
metric, speed = results.results_dict[key], results.speed['inference']

View File

@ -14,9 +14,9 @@ from tqdm import tqdm
from ultralytics.yolo.utils import LOGGER, checks, emojis, is_online
GITHUB_ASSET_NAMES = [f'yolov8{size}{suffix}.pt' for size in 'nsmlx' for suffix in ('', '6', '-cls', '-seg')] + \
[f'yolov5{size}u.pt' for size in 'nsmlx'] + \
[f'yolov3{size}u.pt' for size in ('', '-spp', '-tiny')]
GITHUB_ASSET_NAMES = [f'yolov8{k}{suffix}.pt' for k in 'nsmlx' for suffix in ('', '6', '-cls', '-seg', '-pose')] + \
[f'yolov5{k}u.pt' for k in 'nsmlx'] + \
[f'yolov3{k}u.pt' for k in ('', '-spp', '-tiny')]
GITHUB_ASSET_STEMS = [Path(k).stem for k in GITHUB_ASSET_NAMES]

View File

@ -168,7 +168,7 @@ class Instances:
Args:
bboxes (ndarray): bboxes with shape [N, 4].
segments (list | ndarray): segments.
keypoints (ndarray): keypoints with shape [N, 17, 2].
keypoints (ndarray): keypoints(x, y, visible) with shape [N, 17, 3].
"""
if segments is None:
segments = []

View File

@ -54,3 +54,17 @@ class BboxLoss(nn.Module):
wr = 1 - wl # weight right
return (F.cross_entropy(pred_dist, tl.view(-1), reduction='none').view(tl.shape) * wl +
F.cross_entropy(pred_dist, tr.view(-1), reduction='none').view(tl.shape) * wr).mean(-1, keepdim=True)
class KeypointLoss(nn.Module):
def __init__(self, sigmas) -> None:
super().__init__()
self.sigmas = sigmas
def forward(self, pred_kpts, gt_kpts, kpt_mask, area):
d = (pred_kpts[..., 0] - gt_kpts[..., 0]) ** 2 + (pred_kpts[..., 1] - gt_kpts[..., 1]) ** 2
kpt_loss_factor = (torch.sum(kpt_mask != 0) + torch.sum(kpt_mask == 0)) / (torch.sum(kpt_mask != 0) + 1e-9)
# e = d / (2 * (area * self.sigmas) ** 2 + 1e-9) # from formula
e = d / (2 * self.sigmas) ** 2 / (area + 1e-9) / 2 # from cocoeval
return kpt_loss_factor * ((1 - torch.exp(-e)) * kpt_mask).mean()
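A tiny numeric check of the new loss term, assuming the OKS_SIGMA constant added to metrics.py in this same commit; all tensor values are synthetic:

import torch
from ultralytics.yolo.utils.metrics import OKS_SIGMA  # the 17 COCO keypoint sigmas

pred, gt = torch.zeros(1, 17, 2), torch.zeros(1, 17, 3)  # a perfect prediction
kpt_mask, area = torch.ones(1, 17), torch.tensor([100.0])
d = (pred[..., 0] - gt[..., 0]) ** 2 + (pred[..., 1] - gt[..., 1]) ** 2
e = d / (2 * torch.tensor(OKS_SIGMA)) ** 2 / (area + 1e-9) / 2  # the same cocoeval-style term as KeypointLoss.forward
print(((1 - torch.exp(-e)) * kpt_mask).mean())  # zero distance gives a zero loss contribution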

View File

@ -13,6 +13,8 @@ import torch.nn as nn
from ultralytics.yolo.utils import LOGGER, SimpleClass, TryExcept
OKS_SIGMA = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89]) / 10.0
# boxes
def box_area(box):
@ -108,8 +110,8 @@ def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7
def mask_iou(mask1, mask2, eps=1e-7):
"""
mask1: [N, n] m1 means number of predicted objects
mask2: [M, n] m2 means number of gt objects
mask1: [N, n] m1 means number of gt objects
mask2: [M, n] m2 means number of predicted objects
Note: n means image_w x image_h
Returns: masks iou, [N, M]
"""
@ -118,16 +120,18 @@ def mask_iou(mask1, mask2, eps=1e-7):
return intersection / (union + eps)
def masks_iou(mask1, mask2, eps=1e-7):
def kpt_iou(kpt1, kpt2, area, sigma, eps=1e-7):
"""OKS
kpt1: [N, 17, 3], gt
kpt2: [M, 17, 3], pred
area: [N], areas from gt
"""
mask1: [N, n] m1 means number of predicted objects
mask2: [N, n] m2 means number of gt objects
Note: n means image_w x image_h
Returns: masks iou, (N, )
"""
intersection = (mask1 * mask2).sum(1).clamp(0) # (N, )
union = (mask1.sum(1) + mask2.sum(1))[None] - intersection # (area1 + area2) - intersection
return intersection / (union + eps)
d = (kpt1[:, None, :, 0] - kpt2[..., 0]) ** 2 + (kpt1[:, None, :, 1] - kpt2[..., 1]) ** 2 # (N, M, 17)
sigma = torch.tensor(sigma, device=kpt1.device, dtype=kpt1.dtype) # (17, )
kpt_mask = kpt1[..., 2] != 0 # (N, 17)
e = d / (2 * sigma) ** 2 / (area[:, None, None] + eps) / 2 # from cocoeval
# e = d / ((area[None, :, None] + eps) * sigma) ** 2 / 2 # from formula
return (torch.exp(-e) * kpt_mask[:, None]).sum(-1) / (kpt_mask.sum(-1)[:, None] + eps)
def smooth_BCE(eps=0.1): # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
@ -649,13 +653,13 @@ class SegmentMetrics(SimpleClass):
self.seg = Metric()
self.speed = {'preprocess': 0.0, 'inference': 0.0, 'loss': 0.0, 'postprocess': 0.0}
def process(self, tp_m, tp_b, conf, pred_cls, target_cls):
def process(self, tp_b, tp_m, conf, pred_cls, target_cls):
"""
Processes the detection and segmentation metrics over the given set of predictions.
Args:
tp_m (list): List of True Positive masks.
tp_b (list): List of True Positive boxes.
tp_m (list): List of True Positive masks.
conf (list): List of confidence scores.
pred_cls (list): List of predicted classes.
target_cls (list): List of target classes.
@ -712,6 +716,100 @@ class SegmentMetrics(SimpleClass):
return dict(zip(self.keys + ['fitness'], self.mean_results() + [self.fitness]))
class PoseMetrics(SegmentMetrics):
"""
Calculates and aggregates detection and pose metrics over a given set of classes.
Args:
save_dir (Path): Path to the directory where the output plots should be saved. Default is the current directory.
plot (bool): Whether to save the detection and pose plots. Default is False.
names (list): List of class names. Default is an empty list.
Attributes:
save_dir (Path): Path to the directory where the output plots should be saved.
plot (bool): Whether to save the detection and pose plots.
names (list): List of class names.
box (Metric): An instance of the Metric class to calculate box detection metrics.
pose (Metric): An instance of the Metric class to calculate keypoint/pose metrics.
speed (dict): Dictionary to store the time taken in different phases of inference.
Methods:
process(tp_b, tp_p, conf, pred_cls, target_cls): Processes metrics over the given set of predictions.
mean_results(): Returns the mean of the detection and pose metrics over all the classes.
class_result(i): Returns the detection and pose metrics of class `i`.
maps: Returns the mean Average Precision (mAP) scores for IoU thresholds ranging from 0.50 to 0.95.
fitness: Returns the fitness scores, which are a single weighted combination of metrics.
ap_class_index: Returns the list of indices of classes used to compute Average Precision (AP).
results_dict: Returns the dictionary containing all the detection and pose metrics and fitness score.
"""
def __init__(self, save_dir=Path('.'), plot=False, names=()) -> None:
super().__init__(save_dir, plot, names)
self.save_dir = save_dir
self.plot = plot
self.names = names
self.box = Metric()
self.pose = Metric()
self.speed = {'preprocess': 0.0, 'inference': 0.0, 'loss': 0.0, 'postprocess': 0.0}
def __getattr__(self, attr):
name = self.__class__.__name__
raise AttributeError(f"'{name}' object has no attribute '{attr}'. See valid attributes below.\n{self.__doc__}")
def process(self, tp_b, tp_p, conf, pred_cls, target_cls):
"""
Processes the detection and pose metrics over the given set of predictions.
Args:
tp_b (list): List of True Positive boxes.
tp_p (list): List of True Positive keypoints.
conf (list): List of confidence scores.
pred_cls (list): List of predicted classes.
target_cls (list): List of target classes.
"""
results_pose = ap_per_class(tp_p,
conf,
pred_cls,
target_cls,
plot=self.plot,
save_dir=self.save_dir,
names=self.names,
prefix='Pose')[2:]
self.pose.nc = len(self.names)
self.pose.update(results_pose)
results_box = ap_per_class(tp_b,
conf,
pred_cls,
target_cls,
plot=self.plot,
save_dir=self.save_dir,
names=self.names,
prefix='Box')[2:]
self.box.nc = len(self.names)
self.box.update(results_box)
@property
def keys(self):
return [
'metrics/precision(B)', 'metrics/recall(B)', 'metrics/mAP50(B)', 'metrics/mAP50-95(B)',
'metrics/precision(P)', 'metrics/recall(P)', 'metrics/mAP50(P)', 'metrics/mAP50-95(P)']
def mean_results(self):
return self.box.mean_results() + self.pose.mean_results()
def class_result(self, i):
return self.box.class_result(i) + self.pose.class_result(i)
@property
def maps(self):
return self.box.maps + self.pose.maps
@property
def fitness(self):
return self.pose.fitness() + self.box.fitness()
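A brief sketch of where these metrics surface, assuming the released yolov8n-pose.pt weights and the coco8-pose dataset added earlier in this commit:

from ultralytics import YOLO

metrics = YOLO('yolov8n-pose.pt').val(data='coco8-pose.yaml')
print(metrics.box.map, metrics.pose.map)  # mAP50-95(B) and mAP50-95(P), matching the keys listed above
print(metrics.fitness)                    # pose fitness + box fitness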
class ClassifyMetrics(SimpleClass):
"""
Class for computing classification metrics including top-1 and top-5 accuracy.

View File

@ -281,28 +281,23 @@ def clip_boxes(boxes, shape):
boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2
def clip_coords(boxes, shape):
def clip_coords(coords, shape):
"""
Clip bounding xyxy bounding boxes to image shape (height, width).
Clip line coordinates to the image boundaries.
Args:
boxes (torch.Tensor or numpy.ndarray): Bounding boxes to be clipped.
shape (tuple): The shape of the image. (height, width)
coords (torch.Tensor) or (numpy.ndarray): A list of line coordinates.
shape (tuple): A tuple of integers representing the size of the image in the format (height, width).
Returns:
None
Note:
The input `boxes` is modified in-place, there is no return value.
(None): The function modifies the input `coords` in place, clipping each coordinate to the image boundaries.
"""
if isinstance(boxes, torch.Tensor): # faster individually
boxes[:, 0].clamp_(0, shape[1]) # x1
boxes[:, 1].clamp_(0, shape[0]) # y1
boxes[:, 2].clamp_(0, shape[1]) # x2
boxes[:, 3].clamp_(0, shape[0]) # y2
if isinstance(coords, torch.Tensor): # faster individually
coords[..., 0].clamp_(0, shape[1]) # x
coords[..., 1].clamp_(0, shape[0]) # y
else: # np.array (faster grouped)
boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2
boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2
coords[..., 0] = coords[..., 0].clip(0, shape[1]) # x
coords[..., 1] = coords[..., 1].clip(0, shape[0]) # y
def scale_image(im1_shape, masks, im0_shape, ratio_pad=None):
@ -577,17 +572,18 @@ def process_mask_upsample(protos, masks_in, bboxes, shape):
def process_mask(protos, masks_in, bboxes, shape, upsample=False):
"""
It takes the output of the mask head, and applies the mask to the bounding boxes. This is faster but produces
downsampled quality of mask
Apply masks to bounding boxes using the output of the mask head.
Args:
protos (torch.Tensor): [mask_dim, mask_h, mask_w]
masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms
bboxes (torch.Tensor): [n, 4], n is number of masks after nms
shape (tuple): the size of the input image (h,w)
protos (torch.Tensor): A tensor of shape [mask_dim, mask_h, mask_w].
masks_in (torch.Tensor): A tensor of shape [n, mask_dim], where n is the number of masks after NMS.
bboxes (torch.Tensor): A tensor of shape [n, 4], where n is the number of masks after NMS.
shape (tuple): A tuple of integers representing the size of the input image in the format (h, w).
upsample (bool): A flag to indicate whether to upsample the mask to the original image size. Default is False.
Returns:
(torch.Tensor): The processed masks.
(torch.Tensor): A binary mask tensor of shape [n, h, w], where n is the number of masks after NMS, and h and w
are the height and width of the input image. The mask is applied to the bounding boxes.
"""
c, mh, mw = protos.shape # CHW
@ -632,19 +628,19 @@ def process_mask_native(protos, masks_in, bboxes, shape):
return masks.gt_(0.5)
def scale_segments(img1_shape, segments, img0_shape, ratio_pad=None, normalize=False):
def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None, normalize=False):
"""
Rescale segment coordinates (xyxy) from img1_shape to img0_shape
Args:
img1_shape (tuple): The shape of the image that the segments are from.
segments (torch.Tensor): the segments to be scaled
img1_shape (tuple): The shape of the image that the coords are from.
coords (torch.Tensor): the coords to be scaled
img0_shape (tuple): the shape of the image that the segmentation is being applied to
ratio_pad (tuple): the ratio of the image size to the padded image size.
normalize (bool): If True, the coordinates will be normalized to the range [0, 1]. Defaults to False
Returns:
segments (torch.Tensor): the segmented image.
coords (torch.Tensor): the scaled coordinates.
"""
if ratio_pad is None: # calculate from img0_shape
gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
@ -653,14 +649,15 @@ def scale_segments(img1_shape, segments, img0_shape, ratio_pad=None, normalize=F
gain = ratio_pad[0][0]
pad = ratio_pad[1]
segments[:, 0] -= pad[0] # x padding
segments[:, 1] -= pad[1] # y padding
segments /= gain
clip_segments(segments, img0_shape)
coords[..., 0] -= pad[0] # x padding
coords[..., 1] -= pad[1] # y padding
coords[..., 0] /= gain
coords[..., 1] /= gain
clip_coords(coords, img0_shape)
if normalize:
segments[:, 0] /= img0_shape[1] # width
segments[:, 1] /= img0_shape[0] # height
return segments
coords[..., 0] /= img0_shape[1] # width
coords[..., 1] /= img0_shape[0] # height
return coords
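As a quick sketch of the renamed scale_coords (keypoint variant), assuming the usual letterbox padding of half the leftover border on each side; the confidence channel, if present, is left untouched:
import torch
from ultralytics.yolo.utils.ops import scale_coords
pred_kpts = torch.tensor([[[320., 240., 0.9]]])  # (n, nkpt, 3) in 640x640 letterbox space
out = scale_coords((640, 640), pred_kpts.clone(), (480, 640))
# gain = min(640/480, 640/640) = 1.0, pad = (0, 80) -> x stays 320, y becomes 160, conf stays 0.9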
def masks2segments(masks, strategy='largest'):
@ -688,23 +685,6 @@ def masks2segments(masks, strategy='largest'):
return segments
def clip_segments(segments, shape):
"""
It takes a list of line segments (x1,y1,x2,y2) and clips them to the image shape (height, width)
Args:
segments (list): a list of segments, each segment is a list of points, each point is a list of x,y
coordinates
shape (tuple): the shape of the image
"""
if isinstance(segments, torch.Tensor): # faster individually
segments[:, 0].clamp_(0, shape[1]) # x
segments[:, 1].clamp_(0, shape[0]) # y
else: # np.array (faster grouped)
segments[:, 0] = segments[:, 0].clip(0, shape[1]) # x
segments[:, 1] = segments[:, 1].clip(0, shape[0]) # y
def clean_str(s):
"""
Cleans a string by replacing special characters with underscore _

View File

@ -16,7 +16,7 @@ from ultralytics.yolo.utils import LOGGER, TryExcept, threaded
from .checks import check_font, check_version, is_ascii
from .files import increment_path
from .ops import clip_coords, scale_image, xywh2xyxy, xyxy2xywh
from .ops import clip_boxes, scale_image, xywh2xyxy, xyxy2xywh
matplotlib.rc('font', **{'size': 11})
matplotlib.use('Agg') # for writing to files only
@ -30,6 +30,11 @@ class Colors:
'2C99A8', '00C2FF', '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF', 'FF95C8', 'FF37C7')
self.palette = [self.hex2rgb(f'#{c}') for c in hexs]
self.n = len(self.palette)
self.pose_palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102], [230, 230, 0], [255, 153, 255],
[153, 204, 255], [255, 102, 255], [255, 51, 255], [102, 178, 255], [51, 153, 255],
[255, 153, 153], [255, 102, 102], [255, 51, 51], [153, 255, 153], [102, 255, 102],
[51, 255, 51], [0, 255, 0], [0, 0, 255], [255, 0, 0], [255, 255, 255]],
dtype=np.uint8)
def __call__(self, i, bgr=False):
c = self.palette[int(i) % self.n]
@ -62,6 +67,12 @@ class Annotator:
else: # use cv2
self.im = im
self.lw = line_width or max(round(sum(im.shape) / 2 * 0.003), 2) # line width
# pose
self.skeleton = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12], [7, 13], [6, 7], [6, 8], [7, 9],
[8, 10], [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]
self.limb_color = colors.pose_palette[[9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16]]
self.kpt_color = colors.pose_palette[[16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9]]
def box_label(self, box, label='', color=(128, 128, 128), txt_color=(255, 255, 255)):
# Add one xyxy box to image with label
@ -132,6 +143,49 @@ class Annotator:
# convert im back to PIL and update draw
self.fromarray(self.im)
def kpts(self, kpts, shape=(640, 640), radius=5, kpt_line=True):
"""Plot keypoints.
Args:
kpts (tensor): predicted kpts, shape: [17, 3]
shape (tuple): image shape, (h, w)
radius (int): size of drawing points
kpt_line (bool): whether to draw limb lines connecting keypoints (human pose only)
"""
if self.pil:
# convert to numpy first
self.im = np.asarray(self.im).copy()
nkpt, ndim = kpts.shape
is_pose = nkpt == 17 and ndim == 3
kpt_line &= is_pose # `kpt_line=True` for now only supports human pose plotting
for i, k in enumerate(kpts):
color_k = [int(x) for x in self.kpt_color[i]] if is_pose else colors(i)
x_coord, y_coord = k[0], k[1]
if x_coord % shape[1] != 0 and y_coord % shape[0] != 0:
if len(k) == 3:
conf = k[2]
if conf < 0.5:
continue
cv2.circle(self.im, (int(x_coord), int(y_coord)), radius, color_k, -1)
if kpt_line:
ndim = kpts.shape[-1]
for sk_id, sk in enumerate(self.skeleton):
pos1 = (int(kpts[(sk[0] - 1), 0]), int(kpts[(sk[0] - 1), 1]))
pos2 = (int(kpts[(sk[1] - 1), 0]), int(kpts[(sk[1] - 1), 1]))
if ndim == 3:
conf1 = kpts[(sk[0] - 1), 2]
conf2 = kpts[(sk[1] - 1), 2]
if conf1 < 0.5 or conf2 < 0.5:
continue
if pos1[0] % shape[1] == 0 or pos1[1] % shape[0] == 0 or pos1[0] < 0 or pos1[1] < 0:
continue
if pos2[0] % shape[1] == 0 or pos2[1] % shape[0] == 0 or pos2[0] < 0 or pos2[1] < 0:
continue
cv2.line(self.im, pos1, pos2, [int(x) for x in self.limb_color[sk_id]], thickness=2)
if self.pil:
# convert im back to PIL and update draw
self.fromarray(self.im)
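A hedged sketch of drawing COCO-style keypoints with the new Annotator.kpts; the blank image and random keypoints are placeholders, and points with confidence below 0.5 are skipped as in the code above:
import numpy as np
import torch
from ultralytics.yolo.utils.plotting import Annotator
im = np.zeros((480, 640, 3), dtype=np.uint8)               # blank BGR canvas
kpts = torch.rand(17, 3) * torch.tensor([640., 480., 1.])  # 17 x (x, y, conf)
annotator = Annotator(im, line_width=2)
annotator.kpts(kpts, shape=(480, 640))                     # draws points and the pose skeleton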
def rectangle(self, xy, fill=None, outline=None, width=1):
# Add rectangle to image (PIL-only)
self.draw.rectangle(xy, fill, outline, width)
@ -213,7 +267,7 @@ def save_one_box(xyxy, im, file=Path('im.jpg'), gain=1.02, pad=10, square=False,
b[:, 2:] = b[:, 2:].max(1)[0].unsqueeze(1) # attempt rectangle to square
b[:, 2:] = b[:, 2:] * gain + pad # box wh * gain + pad
xyxy = xywh2xyxy(b).long()
clip_coords(xyxy, im.shape)
clip_boxes(xyxy, im.shape)
crop = im[int(xyxy[0, 1]):int(xyxy[0, 3]), int(xyxy[0, 0]):int(xyxy[0, 2]), ::(1 if BGR else -1)]
if save:
file.parent.mkdir(parents=True, exist_ok=True) # make directory
@ -229,6 +283,7 @@ def plot_images(images,
cls,
bboxes,
masks=np.zeros(0, dtype=np.uint8),
kpts=np.zeros((0, 51), dtype=np.float32),
paths=None,
fname='images.jpg',
names=None):
@ -241,6 +296,8 @@ def plot_images(images,
bboxes = bboxes.cpu().numpy()
if isinstance(masks, torch.Tensor):
masks = masks.cpu().numpy().astype(int)
if isinstance(kpts, torch.Tensor):
kpts = kpts.cpu().numpy()
if isinstance(batch_idx, torch.Tensor):
batch_idx = batch_idx.cpu().numpy()
@ -300,6 +357,21 @@ def plot_images(images,
label = f'{c}' if labels else f'{c} {conf[j]:.1f}'
annotator.box_label(box, label, color=color)
# Plot keypoints
if len(kpts):
kpts_ = kpts[idx].copy()
if len(kpts_):
if kpts_[..., 0].max() <= 1.01 or kpts_[..., 1].max() <= 1.01: # if normalized with tolerance .01
kpts_[..., 0] *= w # scale to pixels
kpts_[..., 1] *= h
elif scale < 1: # absolute coords need scale if image scales
kpts_ *= scale
kpts_[..., 0] += x
kpts_[..., 1] += y
for j in range(len(kpts_)):
if labels or conf[j] > 0.25: # 0.25 conf thresh
annotator.kpts(kpts_[j])
# Plot masks
if len(masks):
if idx.shape[0] == masks.shape[0]: # overlap_masks=False
@ -307,7 +379,7 @@ def plot_images(images,
else: # overlap_masks=True
image_masks = masks[[i]] # (1, 640, 640)
nl = idx.sum()
index = np.arange(nl).reshape(nl, 1, 1) + 1
index = np.arange(nl).reshape((nl, 1, 1)) + 1
image_masks = np.repeat(image_masks, nl, axis=0)
image_masks = np.where(image_masks == index, 1.0, 0.0)
@ -328,13 +400,16 @@ def plot_images(images,
annotator.im.save(fname) # save
def plot_results(file='path/to/results.csv', dir='', segment=False):
def plot_results(file='path/to/results.csv', dir='', segment=False, pose=False):
# Plot training results.csv. Usage: from utils.plots import *; plot_results('path/to/results.csv')
import pandas as pd
save_dir = Path(file).parent if file else Path(dir)
if segment:
fig, ax = plt.subplots(2, 8, figsize=(18, 6), tight_layout=True)
index = [1, 2, 3, 4, 5, 6, 9, 10, 13, 14, 15, 16, 7, 8, 11, 12]
elif pose:
fig, ax = plt.subplots(2, 9, figsize=(21, 6), tight_layout=True)
index = [1, 2, 3, 4, 5, 6, 7, 10, 11, 14, 15, 16, 17, 18, 8, 9, 12, 13]
else:
fig, ax = plt.subplots(2, 5, figsize=(12, 6), tight_layout=True)
index = [1, 2, 3, 4, 5, 8, 9, 10, 6, 7]
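A short usage sketch of the extended plotter; the CSV path is illustrative, and pose=True expects the extra pose/kobj loss columns that PoseTrainer writes to results.csv:
from ultralytics.yolo.utils.plotting import plot_results
plot_results(file='runs/pose/train/results.csv', pose=True)  # saves results.png next to the CSV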

View File

@ -240,8 +240,8 @@ def copy_attr(a, b, include=(), exclude=()):
def get_latest_opset():
# Return max supported ONNX opset by this version of torch
return max(int(k[14:]) for k in vars(torch.onnx) if 'symbolic_opset' in k) # opset
# Return the second-most recent ONNX opset supported by this version of torch (one behind latest, for maturity)
return max(int(k[14:]) for k in vars(torch.onnx) if 'symbolic_opset' in k) - 1 # opset
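The change above now returns one opset behind the newest symbolic_opset module registered on torch.onnx. A quick sketch to inspect the value on a given install (assumes torch still exposes the symbolic_opset* attributes):
import torch
opsets = sorted(int(k[14:]) for k in vars(torch.onnx) if 'symbolic_opset' in k)
print(opsets, opsets[-1] - 1)  # all registered opsets, then what get_latest_opset() now returns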
def intersect_dicts(da, db, exclude=()):
@ -318,18 +318,18 @@ def strip_optimizer(f: Union[str, Path] = 'best.pt', s: str = '') -> None:
"""
Strip optimizer from 'f' to finalize training, optionally save as 's'.
Usage:
from ultralytics.yolo.utils.torch_utils import strip_optimizer
from pathlib import Path
for f in Path('/Users/glennjocher/Downloads/weights').glob('*.pt'):
strip_optimizer(f)
Args:
f (str): file path to model to strip the optimizer from. Default is 'best.pt'.
s (str): file path to save the model with stripped optimizer to. If not provided, 'f' will be overwritten.
Returns:
None
Usage:
from pathlib import Path
from ultralytics.yolo.utils.torch_utils import strip_optimizer
for f in Path('/Users/glennjocher/Downloads/weights').rglob('*.pt'):
strip_optimizer(f)
"""
x = torch.load(f, map_location=torch.device('cpu'))
args = {**DEFAULT_CFG_DICT, **x['train_args']} # combine model args with default args, preferring model args
@ -349,7 +349,9 @@ def strip_optimizer(f: Union[str, Path] = 'best.pt', s: str = '') -> None:
def profile(input, ops, n=10, device=None):
""" YOLOv8 speed/memory/FLOPs profiler
"""
YOLOv8 speed/memory/FLOPs profiler
Usage:
input = torch.randn(16, 3, 640, 640)
m1 = lambda x: x * torch.sigmoid(x)

View File

@ -1,5 +1,5 @@
# Ultralytics YOLO 🚀, GPL-3.0 license
from ultralytics.yolo.v8 import classify, detect, segment
from ultralytics.yolo.v8 import classify, detect, pose, segment
__all__ = 'classify', 'segment', 'detect'
__all__ = 'classify', 'segment', 'detect', 'pose'

View File

@ -41,7 +41,7 @@ class DetectionTrainer(BaseTrainer):
shuffle=mode == 'train',
seed=self.args.seed)[0] if self.args.v5loader else \
build_dataloader(self.args, batch_size, img_path=dataset_path, stride=gs, rank=rank, mode=mode,
rect=mode == 'val', names=self.data['names'])[0]
rect=mode == 'val', data_info=self.data)[0]
def preprocess_batch(self, batch):
batch['img'] = batch['img'].to(self.device, non_blocking=True).float() / 255

View File

@ -41,7 +41,7 @@ class DetectionValidator(BaseValidator):
def init_metrics(self, model):
val = self.data.get(self.args.split, '') # validation path
self.is_coco = isinstance(val, str) and val.endswith(f'coco{os.sep}val2017.txt') # is COCO dataset
self.is_coco = isinstance(val, str) and 'coco' in val and val.endswith(f'{os.sep}val2017.txt') # is COCO
self.class_map = ops.coco80_to_coco91_class() if self.is_coco else list(range(1000))
self.args.save_json |= self.is_coco and not self.training # run on final val if training COCO
self.names = model.names
@ -179,7 +179,7 @@ class DetectionValidator(BaseValidator):
prefix=colorstr(f'{self.args.mode}: '),
shuffle=False,
seed=self.args.seed)[0] if self.args.v5loader else \
build_dataloader(self.args, batch_size, img_path=dataset_path, stride=gs, names=self.data['names'],
build_dataloader(self.args, batch_size, img_path=dataset_path, stride=gs, data_info=self.data,
mode='val')[0]
def plot_val_samples(self, batch, ni):

View File

@ -0,0 +1,7 @@
# Ultralytics YOLO 🚀, GPL-3.0 license
from .predict import PosePredictor, predict
from .train import PoseTrainer, train
from .val import PoseValidator, val
__all__ = 'PoseTrainer', 'train', 'PoseValidator', 'val', 'PosePredictor', 'predict'

View File

@ -0,0 +1,103 @@
# Ultralytics YOLO 🚀, GPL-3.0 license
from ultralytics.yolo.engine.results import Results
from ultralytics.yolo.utils import DEFAULT_CFG, ROOT, ops
from ultralytics.yolo.utils.plotting import colors, save_one_box
from ultralytics.yolo.v8.detect.predict import DetectionPredictor
class PosePredictor(DetectionPredictor):
def postprocess(self, preds, img, orig_img):
preds = ops.non_max_suppression(preds,
self.args.conf,
self.args.iou,
agnostic=self.args.agnostic_nms,
max_det=self.args.max_det,
classes=self.args.classes,
nc=len(self.model.names))
results = []
for i, pred in enumerate(preds):
orig_img = orig_img[i] if isinstance(orig_img, list) else orig_img
shape = orig_img.shape
pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], shape).round()
pred_kpts = pred[:, 6:].view(len(pred), *self.model.kpt_shape) if len(pred) else pred[:, 6:]
pred_kpts = ops.scale_coords(img.shape[2:], pred_kpts, shape)
path, _, _, _, _ = self.batch
img_path = path[i] if isinstance(path, list) else path
results.append(
Results(orig_img=orig_img,
path=img_path,
names=self.model.names,
boxes=pred[:, :6],
keypoints=pred_kpts))
return results
def write_results(self, idx, results, batch):
p, im, im0 = batch
log_string = ''
if len(im.shape) == 3:
im = im[None] # expand for batch dim
self.seen += 1
imc = im0.copy() if self.args.save_crop else im0
if self.source_type.webcam or self.source_type.from_img: # batch_size >= 1
log_string += f'{idx}: '
frame = self.dataset.count
else:
frame = getattr(self.dataset, 'frame', 0)
self.data_path = p
self.txt_path = str(self.save_dir / 'labels' / p.stem) + ('' if self.dataset.mode == 'image' else f'_{frame}')
log_string += '%gx%g ' % im.shape[2:] # print string
self.annotator = self.get_annotator(im0)
det = results[idx].boxes # TODO: make boxes inherit from tensors
if len(det) == 0:
return f'{log_string}(no detections), '
for c in det.cls.unique():
n = (det.cls == c).sum() # detections per class
log_string += f"{n} {self.model.names[int(c)]}{'s' * (n > 1)}, "
kpts = reversed(results[idx].keypoints)
for k in kpts:
self.annotator.kpts(k, shape=results[idx].orig_shape)
# write
for j, d in enumerate(reversed(det)):
c, conf, id = int(d.cls), float(d.conf), None if d.id is None else int(d.id.item())
if self.args.save_txt: # Write to file
kpt = (kpts[j][:, :2] / d.orig_shape[[1, 0]]).reshape(-1).tolist()
box = d.xywhn.view(-1).tolist()
line = (c, *box, *kpt) + (conf, ) * self.args.save_conf + (() if id is None else (id, ))
with open(f'{self.txt_path}.txt', 'a') as f:
f.write(('%g ' * len(line)).rstrip() % line + '\n')
if self.args.save or self.args.show: # Add bbox to image
name = ('' if id is None else f'id:{id} ') + self.model.names[c]
label = (f'{name} {conf:.2f}' if self.args.show_conf else name) if self.args.show_labels else None
if self.args.boxes:
self.annotator.box_label(d.xyxy.squeeze(), label, color=colors(c, True))
if self.args.save_crop:
save_one_box(d.xyxy,
imc,
file=self.save_dir / 'crops' / self.model.model.names[c] / f'{self.data_path.stem}.jpg',
BGR=True)
return log_string
def predict(cfg=DEFAULT_CFG, use_python=False):
model = cfg.model or 'yolov8n-pose.pt'
source = cfg.source if cfg.source is not None else ROOT / 'assets' if (ROOT / 'assets').exists() \
else 'https://ultralytics.com/images/bus.jpg'
args = dict(model=model, source=source)
if use_python:
from ultralytics import YOLO
YOLO(model)(**args)
else:
predictor = PosePredictor(overrides=args)
predictor.predict_cli()
if __name__ == '__main__':
predict()
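A hedged sketch of running the new predictor through the public YOLO API (equivalent to the use_python=True branch above); the keypoints shape assumes the default kpt_shape of [17, 3]:
from ultralytics import YOLO
results = YOLO('yolov8n-pose.pt')('https://ultralytics.com/images/bus.jpg')
for r in results:
    print(r.boxes.xyxy.shape, r.keypoints.shape)  # (n, 4) boxes and (n, 17, 3) keypoints per image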

View File

@ -0,0 +1,170 @@
# Ultralytics YOLO 🚀, GPL-3.0 license
from copy import copy
import torch
import torch.nn as nn
from ultralytics.nn.tasks import PoseModel
from ultralytics.yolo import v8
from ultralytics.yolo.utils import DEFAULT_CFG
from ultralytics.yolo.utils.loss import KeypointLoss
from ultralytics.yolo.utils.metrics import OKS_SIGMA
from ultralytics.yolo.utils.ops import xyxy2xywh
from ultralytics.yolo.utils.plotting import plot_images, plot_results
from ultralytics.yolo.utils.tal import make_anchors
from ultralytics.yolo.utils.torch_utils import de_parallel
from ultralytics.yolo.v8.detect.train import Loss
# BaseTrainer python usage
class PoseTrainer(v8.detect.DetectionTrainer):
def __init__(self, cfg=DEFAULT_CFG, overrides=None):
if overrides is None:
overrides = {}
overrides['task'] = 'pose'
super().__init__(cfg, overrides)
def get_model(self, cfg=None, weights=None, verbose=True):
model = PoseModel(cfg, ch=3, nc=self.data['nc'], data_kpt_shape=self.data['kpt_shape'], verbose=verbose)
if weights:
model.load(weights)
return model
def set_model_attributes(self):
super().set_model_attributes()
self.model.kpt_shape = self.data['kpt_shape']
def get_validator(self):
self.loss_names = 'box_loss', 'pose_loss', 'kobj_loss', 'cls_loss', 'dfl_loss'
return v8.pose.PoseValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args))
def criterion(self, preds, batch):
if not hasattr(self, 'compute_loss'):
self.compute_loss = PoseLoss(de_parallel(self.model))
return self.compute_loss(preds, batch)
def plot_training_samples(self, batch, ni):
images = batch['img']
kpts = batch['keypoints']
cls = batch['cls'].squeeze(-1)
bboxes = batch['bboxes']
paths = batch['im_file']
batch_idx = batch['batch_idx']
plot_images(images,
batch_idx,
cls,
bboxes,
kpts=kpts,
paths=paths,
fname=self.save_dir / f'train_batch{ni}.jpg')
def plot_metrics(self):
plot_results(file=self.csv, pose=True) # save results.png
# Criterion class for computing training losses
class PoseLoss(Loss):
def __init__(self, model): # model must be de-paralleled
super().__init__(model)
self.kpt_shape = model.model[-1].kpt_shape
self.bce_pose = nn.BCEWithLogitsLoss()
is_pose = self.kpt_shape == [17, 3]
nkpt = self.kpt_shape[0] # number of keypoints
sigmas = torch.from_numpy(OKS_SIGMA).to(self.device) if is_pose else torch.ones(nkpt, device=self.device) / nkpt
self.keypoint_loss = KeypointLoss(sigmas=sigmas)
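The sigmas feed an object-keypoint-similarity style kernel, roughly exp(-d^2 / (2*s^2*k^2)) with d the keypoint error, s^2 the box area and k = 2*sigma. A rough sketch of that kernel only (not the exact KeypointLoss implementation):
import torch
def oks_kernel(pred_xy, gt_xy, area, sigmas):
    # pred_xy, gt_xy: (n, nkpt, 2); area: (n, 1); sigmas: (nkpt,)
    d2 = ((pred_xy - gt_xy) ** 2).sum(-1)  # squared error per keypoint
    return torch.exp(-d2 / (2 * area * (2 * sigmas) ** 2 + 1e-9))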
def __call__(self, preds, batch):
loss = torch.zeros(5, device=self.device)  # box, pose (kpt_location), kobj (kpt_visibility), cls, dfl
feats, pred_kpts = preds if isinstance(preds[0], list) else preds[1]
pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
(self.reg_max * 4, self.nc), 1)
# b, grids, ..
pred_scores = pred_scores.permute(0, 2, 1).contiguous()
pred_distri = pred_distri.permute(0, 2, 1).contiguous()
pred_kpts = pred_kpts.permute(0, 2, 1).contiguous()
dtype = pred_scores.dtype
imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0] # image size (h,w)
anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
# targets
batch_size = pred_scores.shape[0]
batch_idx = batch['batch_idx'].view(-1, 1)
targets = torch.cat((batch_idx, batch['cls'].view(-1, 1), batch['bboxes']), 1)
targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
gt_labels, gt_bboxes = targets.split((1, 4), 2) # cls, xyxy
mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)
# pboxes
pred_bboxes = self.bbox_decode(anchor_points, pred_distri) # xyxy, (b, h*w, 4)
pred_kpts = self.kpts_decode(anchor_points, pred_kpts.view(batch_size, -1, *self.kpt_shape)) # (b, h*w, 17, 3)
_, target_bboxes, target_scores, fg_mask, target_gt_idx = self.assigner(
pred_scores.detach().sigmoid(), (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
anchor_points * stride_tensor, gt_labels, gt_bboxes, mask_gt)
target_scores_sum = max(target_scores.sum(), 1)
# cls loss
# loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum # VFL way
loss[3] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum # BCE
# bbox loss
if fg_mask.sum():
target_bboxes /= stride_tensor
loss[0], loss[4] = self.bbox_loss(pred_distri, pred_bboxes, anchor_points, target_bboxes, target_scores,
target_scores_sum, fg_mask)
keypoints = batch['keypoints'].to(self.device).float().clone()
keypoints[..., 0] *= imgsz[1]
keypoints[..., 1] *= imgsz[0]
for i in range(batch_size):
if fg_mask[i].sum():
idx = target_gt_idx[i][fg_mask[i]]
gt_kpt = keypoints[batch_idx.view(-1) == i][idx]  # (n, nkpt, 3)
gt_kpt[..., 0] /= stride_tensor[fg_mask[i]]
gt_kpt[..., 1] /= stride_tensor[fg_mask[i]]
area = xyxy2xywh(target_bboxes[i][fg_mask[i]])[:, 2:].prod(1, keepdim=True)
pred_kpt = pred_kpts[i][fg_mask[i]]
kpt_mask = gt_kpt[..., 2] != 0
loss[1] += self.keypoint_loss(pred_kpt, gt_kpt, kpt_mask, area) # pose loss
# kpt_score loss
if pred_kpt.shape[-1] == 3:
loss[2] += self.bce_pose(pred_kpt[..., 2], kpt_mask.float()) # keypoint obj loss
loss[0] *= self.hyp.box # box gain
loss[1] *= self.hyp.pose / batch_size # pose gain
loss[2] *= self.hyp.kobj / batch_size # kobj gain
loss[3] *= self.hyp.cls # cls gain
loss[4] *= self.hyp.dfl # dfl gain
return loss.sum() * batch_size, loss.detach()  # loss(box, pose, kobj, cls, dfl)
def kpts_decode(self, anchor_points, pred_kpts):
y = pred_kpts.clone()
y[..., :2] *= 2.0
y[..., 0] += anchor_points[:, [0]] - 0.5
y[..., 1] += anchor_points[:, [1]] - 0.5
return y
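kpts_decode mirrors the box decoding: raw offsets are doubled and re-centred on their anchor, with the stride handled separately against the targets. A small worked example under that reading:
# anchor point (10.5, 20.5) on the grid, raw keypoint prediction (0.3, -0.2, conf):
# x = 0.3 * 2 + (10.5 - 0.5) = 10.6    y = -0.2 * 2 + (20.5 - 0.5) = 19.6   (grid units)
# the matching gt_kpt is divided by stride_tensor above, so both sides compare in grid units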
def train(cfg=DEFAULT_CFG, use_python=False):
model = cfg.model or 'yolov8n-pose.yaml'
data = cfg.data or 'coco8-pose.yaml'
device = cfg.device if cfg.device is not None else ''
args = dict(model=model, data=data, device=device)
if use_python:
from ultralytics import YOLO
YOLO(model).train(**args)
else:
trainer = PoseTrainer(overrides=args)
trainer.train()
if __name__ == '__main__':
train()
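A minimal training sketch via the YOLO API, mirroring the defaults above; epochs and imgsz are illustrative, and coco8-pose.yaml is the small dataset added in this release:
from ultralytics import YOLO
YOLO('yolov8n-pose.yaml').train(data='coco8-pose.yaml', epochs=3, imgsz=640)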

View File

@ -0,0 +1,213 @@
# Ultralytics YOLO 🚀, GPL-3.0 license
from pathlib import Path
import numpy as np
import torch
from ultralytics.yolo.utils import DEFAULT_CFG, LOGGER, ops
from ultralytics.yolo.utils.checks import check_requirements
from ultralytics.yolo.utils.metrics import OKS_SIGMA, PoseMetrics, box_iou, kpt_iou
from ultralytics.yolo.utils.plotting import output_to_target, plot_images
from ultralytics.yolo.v8.detect import DetectionValidator
class PoseValidator(DetectionValidator):
def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None):
super().__init__(dataloader, save_dir, pbar, args)
self.args.task = 'pose'
self.metrics = PoseMetrics(save_dir=self.save_dir)
def preprocess(self, batch):
batch = super().preprocess(batch)
batch['keypoints'] = batch['keypoints'].to(self.device).float()
return batch
def get_desc(self):
return ('%22s' + '%11s' * 10) % ('Class', 'Images', 'Instances', 'Box(P', 'R', 'mAP50', 'mAP50-95)', 'Pose(P',
'R', 'mAP50', 'mAP50-95)')
def postprocess(self, preds):
preds = ops.non_max_suppression(preds,
self.args.conf,
self.args.iou,
labels=self.lb,
multi_label=True,
agnostic=self.args.single_cls,
max_det=self.args.max_det,
nc=self.nc)
return preds
def init_metrics(self, model):
super().init_metrics(model)
self.kpt_shape = self.data['kpt_shape']
is_pose = self.kpt_shape == [17, 3]
nkpt = self.kpt_shape[0]
self.sigma = OKS_SIGMA if is_pose else np.ones(nkpt) / nkpt
def update_metrics(self, preds, batch):
# Metrics
for si, pred in enumerate(preds):
idx = batch['batch_idx'] == si
cls = batch['cls'][idx]
bbox = batch['bboxes'][idx]
kpts = batch['keypoints'][idx]
nl, npr = cls.shape[0], pred.shape[0] # number of labels, predictions
nk = kpts.shape[1] # number of keypoints
shape = batch['ori_shape'][si]
correct_kpts = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init
correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init
self.seen += 1
if npr == 0:
if nl:
self.stats.append((correct_bboxes, correct_kpts, *torch.zeros(
(2, 0), device=self.device), cls.squeeze(-1)))
if self.args.plots:
self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
continue
# Predictions
if self.args.single_cls:
pred[:, 5] = 0
predn = pred.clone()
ops.scale_boxes(batch['img'][si].shape[1:], predn[:, :4], shape,
ratio_pad=batch['ratio_pad'][si]) # native-space pred
pred_kpts = predn[:, 6:].view(npr, nk, -1)
ops.scale_coords(batch['img'][si].shape[1:], pred_kpts, shape, ratio_pad=batch['ratio_pad'][si])
# Evaluate
if nl:
height, width = batch['img'].shape[2:]
tbox = ops.xywh2xyxy(bbox) * torch.tensor(
(width, height, width, height), device=self.device) # target boxes
ops.scale_boxes(batch['img'][si].shape[1:], tbox, shape,
ratio_pad=batch['ratio_pad'][si]) # native-space labels
tkpts = kpts.clone()
tkpts[..., 0] *= width
tkpts[..., 1] *= height
tkpts = ops.scale_coords(batch['img'][si].shape[1:], tkpts, shape, ratio_pad=batch['ratio_pad'][si])
labelsn = torch.cat((cls, tbox), 1) # native-space labels
correct_bboxes = self._process_batch(predn[:, :6], labelsn)
correct_kpts = self._process_batch(predn[:, :6], labelsn, pred_kpts, tkpts)
if self.args.plots:
self.confusion_matrix.process_batch(predn, labelsn)
# Append correct_bboxes, correct_kpts, pconf, pcls, tcls
self.stats.append((correct_bboxes, correct_kpts, pred[:, 4], pred[:, 5], cls.squeeze(-1)))
# Save
if self.args.save_json:
self.pred_to_json(predn, batch['im_file'][si])
# if self.args.save_txt:
# save_one_txt(predn, save_conf, shape, file=save_dir / 'labels' / f'{path.stem}.txt')
def _process_batch(self, detections, labels, pred_kpts=None, gt_kpts=None):
"""
Return correct prediction matrix
Arguments:
detections (array[N, 6]), x1, y1, x2, y2, conf, class
labels (array[M, 5]), class, x1, y1, x2, y2
pred_kpts (array[N, 17, 3]), optional, predicted keypoints
gt_kpts (array[M, 17, 3]), optional, ground-truth keypoints
Returns:
correct (array[N, 10]), for 10 IoU levels
"""
if pred_kpts is not None and gt_kpts is not None:
# `0.53` is from https://github.com/jin-s13/xtcocoapi/blob/master/xtcocotools/cocoeval.py#L384
area = ops.xyxy2xywh(labels[:, 1:])[:, 2:].prod(1) * 0.53
iou = kpt_iou(gt_kpts, pred_kpts, sigma=self.sigma, area=area)
else: # boxes
iou = box_iou(labels[:, 1:], detections[:, :4])
correct = np.zeros((detections.shape[0], self.iouv.shape[0])).astype(bool)
correct_class = labels[:, 0:1] == detections[:, 5]
for i in range(len(self.iouv)):
x = torch.where((iou >= self.iouv[i]) & correct_class) # IoU > threshold and classes match
if x[0].shape[0]:
matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]),
1).cpu().numpy() # [label, detect, iou]
if x[0].shape[0] > 1:
matches = matches[matches[:, 2].argsort()[::-1]]
matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
# matches = matches[matches[:, 2].argsort()[::-1]]
matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
correct[matches[:, 1].astype(int), i] = True
return torch.tensor(correct, dtype=torch.bool, device=detections.device)
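To make the matching concrete, a hedged illustration of what the boolean [N, 10] matrix encodes (thresholds from self.iouv = linspace(0.5, 0.95, 10); numbers are illustrative):
# a prediction whose best same-class OKS against any label is 0.62 gets
#   correct_kpts row: [True, True, True, False, False, False, False, False, False, False]
# while a box IoU of 0.80 for the same prediction clears seven thresholds in correct_bboxes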
def plot_val_samples(self, batch, ni):
plot_images(batch['img'],
batch['batch_idx'],
batch['cls'].squeeze(-1),
batch['bboxes'],
kpts=batch['keypoints'],
paths=batch['im_file'],
fname=self.save_dir / f'val_batch{ni}_labels.jpg',
names=self.names)
def plot_predictions(self, batch, preds, ni):
pred_kpts = torch.cat([p[:, 6:].view(-1, *self.kpt_shape)[:15] for p in preds], 0)
plot_images(batch['img'],
*output_to_target(preds, max_det=15),
kpts=pred_kpts,
paths=batch['im_file'],
fname=self.save_dir / f'val_batch{ni}_pred.jpg',
names=self.names) # pred
def pred_to_json(self, predn, filename):
stem = Path(filename).stem
image_id = int(stem) if stem.isnumeric() else stem
box = ops.xyxy2xywh(predn[:, :4]) # xywh
box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner
for p, b in zip(predn.tolist(), box.tolist()):
self.jdict.append({
'image_id': image_id,
'category_id': self.class_map[int(p[5])],
'bbox': [round(x, 3) for x in b],
'keypoints': p[6:],
'score': round(p[4], 5)})
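Each appended record follows the COCO keypoint results convention; a hypothetical entry (values illustrative, keypoints flattened to 51 numbers as x, y, conf per point) would look like:
# {'image_id': 397133,
#  'category_id': 1,
#  'bbox': [217.1, 176.5, 122.4, 336.8],
#  'keypoints': [301.2, 200.8, 0.98, ...],   # 17 * 3 values
#  'score': 0.91342}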
def eval_json(self, stats):
if self.args.save_json and self.is_coco and len(self.jdict):
anno_json = self.data['path'] / 'annotations/person_keypoints_val2017.json' # annotations
pred_json = self.save_dir / 'predictions.json' # predictions
LOGGER.info(f'\nEvaluating pycocotools mAP using {pred_json} and {anno_json}...')
try: # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
check_requirements('pycocotools>=2.0.6')
from pycocotools.coco import COCO # noqa
from pycocotools.cocoeval import COCOeval # noqa
for x in anno_json, pred_json:
assert x.is_file(), f'{x} file not found'
anno = COCO(str(anno_json)) # init annotations api
pred = anno.loadRes(str(pred_json)) # init predictions api (must pass string, not Path)
for i, eval in enumerate([COCOeval(anno, pred, 'bbox'), COCOeval(anno, pred, 'keypoints')]):
if self.is_coco:
eval.params.imgIds = [int(Path(x).stem) for x in self.dataloader.dataset.im_files] # im to eval
eval.evaluate()
eval.accumulate()
eval.summarize()
idx = i * 4 + 2
stats[self.metrics.keys[idx + 1]], stats[
self.metrics.keys[idx]] = eval.stats[:2] # update mAP50-95 and mAP50
except Exception as e:
LOGGER.warning(f'pycocotools unable to run: {e}')
return stats
def val(cfg=DEFAULT_CFG, use_python=False):
model = cfg.model or 'yolov8n-pose.pt'
data = cfg.data or 'coco128-pose.yaml'
args = dict(model=model, data=data)
if use_python:
from ultralytics import YOLO
YOLO(model).val(**args)
else:
validator = PoseValidator(args=args)
validator(model=args['model'])
if __name__ == '__main__':
val()
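And a matching validation sketch through the YOLO API; the dataset name is illustrative, and the .box/.pose accessors assume PoseMetrics exposes separate Metric objects for boxes and keypoints:
from ultralytics import YOLO
metrics = YOLO('yolov8n-pose.pt').val(data='coco8-pose.yaml')
print(metrics.box.map, metrics.pose.map)  # box and pose mAP50-95 (assumed attribute names)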

View File

@ -65,7 +65,7 @@ class SegmentationValidator(DetectionValidator):
if npr == 0:
if nl:
self.stats.append((correct_masks, correct_bboxes, *torch.zeros(
self.stats.append((correct_bboxes, correct_masks, *torch.zeros(
(2, 0), device=self.device), cls.squeeze(-1)))
if self.args.plots:
self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
@ -103,7 +103,7 @@ class SegmentationValidator(DetectionValidator):
self.confusion_matrix.process_batch(predn, labelsn)
# Append correct_bboxes, correct_masks, pconf, pcls, tcls
self.stats.append((correct_masks, correct_bboxes, pred[:, 4], pred[:, 5], cls.squeeze(-1)))
self.stats.append((correct_bboxes, correct_masks, pred[:, 4], pred[:, 5], cls.squeeze(-1)))
pred_masks = torch.as_tensor(pred_masks, dtype=torch.uint8)
if self.args.plots and self.batch_i < 3:
@ -220,8 +220,7 @@ class SegmentationValidator(DetectionValidator):
pred = anno.loadRes(str(pred_json)) # init predictions api (must pass string, not Path)
for i, eval in enumerate([COCOeval(anno, pred, 'bbox'), COCOeval(anno, pred, 'segm')]):
if self.is_coco:
eval.params.imgIds = [int(Path(x).stem)
for x in self.dataloader.dataset.im_files] # images to eval
eval.params.imgIds = [int(Path(x).stem) for x in self.dataloader.dataset.im_files] # im to eval
eval.evaluate()
eval.accumulate()
eval.summarize()