ultralytics 8.0.136 refactor and simplify package (#3748)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
2023-07-16 23:47:45 +08:00
parent 8ebe94d1e9
commit 620f3eb218
383 changed files with 4213 additions and 4646 deletions
--- a/ultralytics/data/__init__.py
+++ b/ultralytics/data/__init__.py
@ -0,0 +1,8 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+from .base import BaseDataset
+from .build import build_dataloader, build_yolo_dataset, load_inference_source
+from .dataset import ClassificationDataset, SemanticDataset, YOLODataset
+
+__all__ = ('BaseDataset', 'ClassificationDataset', 'SemanticDataset', 'YOLODataset', 'build_yolo_dataset',
+           'build_dataloader', 'load_inference_source')
--- a/ultralytics/data/annotator.py
+++ b/ultralytics/data/annotator.py
@ -0,0 +1,39 @@
+from pathlib import Path
+
+from ultralytics import SAM, YOLO
+
+
+def auto_annotate(data, det_model='yolov8x.pt', sam_model='sam_b.pt', device='', output_dir=None):
+    """
+    Produce segmentation label files for a folder of images by chaining a YOLO detector with a SAM segmenter.
+
+    Each image is run through the detector; the detected boxes are then handed to SAM as prompts and the
+    resulting normalized polygons are written to one '<image stem>.txt' file per image, one line per instance
+    in the form '<class id> x1 y1 x2 y2 ...'. Images without detections produce no file.
+
+    Args:
+        data (str): Path to a folder containing images to be annotated.
+        det_model (str, optional): Pre-trained YOLO detection model. Defaults to 'yolov8x.pt'.
+        sam_model (str, optional): Pre-trained SAM segmentation model. Defaults to 'sam_b.pt'.
+        device (str, optional): Device passed to both models. Defaults to an empty string.
+        output_dir (str | None, optional): Directory to save the label files.
+            Defaults to a 'labels' folder next to 'data'.
+    """
+    detector = YOLO(det_model)
+    segmenter = SAM(sam_model)
+
+    label_dir = Path(output_dir) if output_dir else Path(str(data)).parent / 'labels'
+    label_dir.mkdir(exist_ok=True, parents=True)
+
+    for result in detector(data, stream=True, device=device):
+        boxes = result.boxes.xyxy  # box prompts for SAM
+        class_ids = result.boxes.cls.int().tolist()  # noqa
+        if not class_ids:
+            continue  # nothing detected in this image
+
+        sam_results = segmenter(result.orig_img, bboxes=boxes, verbose=False, save=False, device=device)
+        segments = sam_results[0].masks.xyn  # noqa
+
+        label_path = f'{label_dir / Path(result.path).stem}.txt'
+        with open(label_path, 'w') as f:
+            for idx, polygon in enumerate(segments):
+                if len(polygon) == 0:
+                    continue  # SAM returned no outline for this box
+                coords = ' '.join(map(str, polygon.reshape(-1).tolist()))
+                f.write(f'{class_ids[idx]} ' + coords + '\n')
--- a/ultralytics/data/augment.py
+++ b/ultralytics/data/augment.py
@ -0,0 +1,906 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+import math
+import random
+from copy import deepcopy
+
+import cv2
+import numpy as np
+import torch
+import torchvision.transforms as T
+
+from ultralytics.utils import LOGGER, colorstr
+from ultralytics.utils.checks import check_version
+from ultralytics.utils.instance import Instances
+from ultralytics.utils.metrics import bbox_ioa
+from ultralytics.utils.ops import segment2box
+
+from .utils import polygons2masks, polygons2masks_overlap
+
+POSE_FLIPLR_INDEX = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
+
+
+# TODO: we might need a BaseTransform to make all these augments compatible with both classification and semantic segmentation
+class BaseTransform:
+    """Skeleton transform whose three hooks are no-ops; calling it runs the hooks in a fixed order."""
+
+    def __init__(self) -> None:
+        """Create the transform; the base class keeps no state."""
+        return None
+
+    def apply_image(self, labels):
+        """Image hook; does nothing in the base class."""
+        return None
+
+    def apply_instances(self, labels):
+        """Object-instance hook; does nothing in the base class."""
+        return None
+
+    def apply_semantic(self, labels):
+        """Semantic-mask hook; does nothing in the base class."""
+        return None
+
+    def __call__(self, labels):
+        """Run the image, instances and semantic hooks on `labels`, in that order; returns None."""
+        for hook in (self.apply_image, self.apply_instances, self.apply_semantic):
+            hook(labels)
+
+
+class Compose:
+    """Chain of callables applied one after another, each receiving the previous one's output."""
+
+    def __init__(self, transforms):
+        """Keep a reference to the given list of transforms (the list is not copied)."""
+        self.transforms = transforms
+
+    def __call__(self, data):
+        """Feed `data` through every transform in order and return the final result."""
+        result = data
+        for transform in self.transforms:
+            result = transform(result)
+        return result
+
+    def append(self, transform):
+        """Add `transform` to the end of the chain."""
+        self.transforms.append(transform)
+
+    def tolist(self):
+        """Return the underlying list of transforms (the same object, not a copy)."""
+        return self.transforms
+
+    def __repr__(self):
+        """Return the class name followed by one indented line per transform."""
+        body = ''.join(f'\n    {t}' for t in self.transforms)
+        return f'{self.__class__.__name__}(' + body + '\n)'
+
+
+class BaseMixTransform:
+    """
+    Base class for augmentations that blend the current sample with other samples of the dataset.
+
+    Subclasses provide `get_indexes` (which other samples to fetch) and `_mix_transform` (how to combine them).
+    This implementation is from mmyolo.
+    """
+
+    def __init__(self, dataset, pre_transform=None, p=0.0) -> None:
+        """Store the dataset, an optional transform for the extra samples and the apply probability `p`."""
+        self.dataset = dataset
+        self.pre_transform = pre_transform
+        self.p = p
+
+    def __call__(self, labels):
+        """
+        With probability `p`, fetch the extra samples, pre-transform them and mix them into `labels`.
+
+        The extra samples are exposed to `_mix_transform` under labels['mix_labels'] and removed again from
+        the returned dict. When the random draw exceeds `p`, `labels` is returned untouched.
+        """
+        if random.uniform(0, 1) > self.p:
+            return labels
+
+        # One index (MixUp) or several (Mosaic); normalise to a list
+        picked = self.get_indexes()
+        picked = [picked] if isinstance(picked, int) else picked
+
+        # Load the partner samples that Mosaic or MixUp will combine with the current one
+        partners = [self.dataset.get_image_and_label(idx) for idx in picked]
+        if self.pre_transform is not None:
+            partners = [self.pre_transform(sample) for sample in partners]
+        labels['mix_labels'] = partners
+
+        # Mosaic or MixUp
+        mixed = self._mix_transform(labels)
+        mixed.pop('mix_labels', None)
+        return mixed
+
+    def _mix_transform(self, labels):
+        """Combine `labels` with labels['mix_labels']; must be implemented by subclasses."""
+        raise NotImplementedError
+
+    def get_indexes(self):
+        """Return the dataset index (int) or indexes (list) of the samples to mix in; subclass hook."""
+        raise NotImplementedError
+
+
+class Mosaic(BaseMixTransform):
+    """
+    Mosaic augmentation.
+
+    This class performs mosaic augmentation by combining multiple (4 or 9) images into a single mosaic image.
+    The augmentation is applied to a dataset with a given probability.
+
+    Attributes:
+        dataset: The dataset on which the mosaic augmentation is applied.
+        imgsz (int, optional): Image size (height and width) after mosaic pipeline of a single image. Default to 640.
+        p (float, optional): Probability of applying the mosaic augmentation. Must be in the range 0-1. Default to 1.0.
+        n (int, optional): The grid size, either 4 (for 2x2) or 9 (for 3x3).
+        border (tuple): (-imgsz // 2, -imgsz // 2); exported as 'mosaic_border' in the output labels.
+    """
+
+    def __init__(self, dataset, imgsz=640, p=1.0, n=4):
+        """Initializes the object with a dataset, image size, probability, and grid size (4 or 9)."""
+        assert 0 <= p <= 1.0, f'The probability should be in range [0, 1], but got {p}.'
+        assert n in (4, 9), 'grid must be equal to 4 or 9.'
+        super().__init__(dataset=dataset, p=p)
+        self.dataset = dataset  # redundant: BaseMixTransform.__init__ already stores the dataset
+        self.imgsz = imgsz
+        self.border = (-imgsz // 2, -imgsz // 2)  # width, height
+        self.n = n
+
+    def get_indexes(self, buffer=True):
+        """Return n - 1 random dataset indexes, drawn from dataset.buffer (default) or from the whole dataset."""
+        if buffer:  # select images from buffer
+            return random.choices(list(self.dataset.buffer), k=self.n - 1)
+        else:  # select any images
+            return [random.randint(0, len(self.dataset) - 1) for _ in range(self.n - 1)]
+
+    def _mix_transform(self, labels):
+        """Apply the 2x2 or 3x3 mosaic (depending on self.n) to the input image and labels."""
+        assert labels.get('rect_shape', None) is None, 'rect and mosaic are mutually exclusive.'
+        assert len(labels.get('mix_labels', [])), 'There are no other images for mosaic augment.'
+        return self._mosaic4(labels) if self.n == 4 else self._mosaic9(labels)
+
+    def _mosaic4(self, labels):
+        """
+        Create a 2x2 image mosaic.
+
+        The four images (the current one plus three from labels['mix_labels']) are pasted around a randomly
+        chosen center on a (2 * imgsz, 2 * imgsz) canvas pre-filled with the value 114. Each patch's
+        'resized_shape' entry is popped, and its instances are shifted by the patch offset.
+        """
+        mosaic_labels = []
+        s = self.imgsz
+        # border entries are negative, so the center is drawn from roughly [s / 2, 3 * s / 2]
+        yc, xc = (int(random.uniform(-x, 2 * s + x)) for x in self.border)  # mosaic center y, x
+        for i in range(4):
+            labels_patch = labels if i == 0 else labels['mix_labels'][i - 1]
+            # Load image
+            img = labels_patch['img']
+            h, w = labels_patch.pop('resized_shape')
+
+            # Place img in img4
+            if i == 0:  # top left
+                img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8)  # base image with 4 tiles
+                x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc  # xmin, ymin, xmax, ymax (large image)
+                x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h  # xmin, ymin, xmax, ymax (small image)
+            elif i == 1:  # top right
+                x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
+                x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
+            elif i == 2:  # bottom left
+                x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
+                x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
+            elif i == 3:  # bottom right
+                x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
+                x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)
+
+            img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]  # img4[ymin:ymax, xmin:xmax]
+            # Offset of the patch origin inside the canvas; may be negative when the patch is cropped
+            padw = x1a - x1b
+            padh = y1a - y1b
+
+            labels_patch = self._update_labels(labels_patch, padw, padh)
+            mosaic_labels.append(labels_patch)
+        final_labels = self._cat_labels(mosaic_labels)
+        final_labels['img'] = img4
+        return final_labels
+
+    def _mosaic9(self, labels):
+        """
+        Create a 3x3 image mosaic.
+
+        Nine images are laid out around the first (center) image on a (3 * imgsz, 3 * imgsz) canvas filled
+        with 114; the canvas is then cropped by |border| on every side, so label offsets are shifted by
+        `self.border` to stay aligned with the cropped image.
+        """
+        mosaic_labels = []
+        s = self.imgsz
+        hp, wp = -1, -1  # height, width previous
+        for i in range(9):
+            labels_patch = labels if i == 0 else labels['mix_labels'][i - 1]
+            # Load image
+            img = labels_patch['img']
+            h, w = labels_patch.pop('resized_shape')
+
+            # Place img in img9
+            if i == 0:  # center
+                img9 = np.full((s * 3, s * 3, img.shape[2]), 114, dtype=np.uint8)  # base image with 9 tiles
+                h0, w0 = h, w
+                c = s, s, s + w, s + h  # xmin, ymin, xmax, ymax (base) coordinates
+            elif i == 1:  # top
+                c = s, s - h, s + w, s
+            elif i == 2:  # top right
+                c = s + wp, s - h, s + wp + w, s
+            elif i == 3:  # right
+                c = s + w0, s, s + w0 + w, s + h
+            elif i == 4:  # bottom right
+                c = s + w0, s + hp, s + w0 + w, s + hp + h
+            elif i == 5:  # bottom
+                c = s + w0 - w, s + h0, s + w0, s + h0 + h
+            elif i == 6:  # bottom left
+                c = s + w0 - wp - w, s + h0, s + w0 - wp, s + h0 + h
+            elif i == 7:  # left
+                c = s - w, s + h0 - h, s, s + h0
+            elif i == 8:  # top left
+                c = s - w, s + h0 - hp - h, s, s + h0 - hp
+
+            padw, padh = c[:2]
+            x1, y1, x2, y2 = (max(x, 0) for x in c)  # allocate coords
+
+            # Image
+            img9[y1:y2, x1:x2] = img[y1 - padh:, x1 - padw:]  # img9[ymin:ymax, xmin:xmax]
+            hp, wp = h, w  # height, width previous for next iteration
+
+            # Labels assuming imgsz*2 mosaic size
+            labels_patch = self._update_labels(labels_patch, padw + self.border[0], padh + self.border[1])
+            mosaic_labels.append(labels_patch)
+        final_labels = self._cat_labels(mosaic_labels)
+
+        # border is negative, so this slices [s // 2 : -(s // 2)] on both axes
+        final_labels['img'] = img9[-self.border[0]:self.border[0], -self.border[1]:self.border[1]]
+        return final_labels
+
+    @staticmethod
+    def _update_labels(labels, padw, padh):
+        """Convert a patch's instances to xyxy format, denormalize by the patch image size and shift by (padw, padh)."""
+        nh, nw = labels['img'].shape[:2]
+        labels['instances'].convert_bbox(format='xyxy')
+        labels['instances'].denormalize(nw, nh)
+        labels['instances'].add_padding(padw, padh)
+        return labels
+
+    def _cat_labels(self, mosaic_labels):
+        """
+        Return labels with mosaic border instances clipped.
+
+        Concatenates classes and instances of all patches, clips them to the (2 * imgsz) mosaic size and drops
+        the classes of boxes rejected by `remove_zero_area_boxes`. 'im_file' and 'ori_shape' are taken from
+        the first patch. Returns an empty dict when no patches are given.
+        """
+        if len(mosaic_labels) == 0:
+            return {}
+        cls = []
+        instances = []
+        imgsz = self.imgsz * 2  # mosaic imgsz
+        for labels in mosaic_labels:
+            cls.append(labels['cls'])
+            instances.append(labels['instances'])
+        final_labels = {
+            'im_file': mosaic_labels[0]['im_file'],
+            'ori_shape': mosaic_labels[0]['ori_shape'],
+            'resized_shape': (imgsz, imgsz),
+            'cls': np.concatenate(cls, 0),
+            'instances': Instances.concatenate(instances, axis=0),
+            'mosaic_border': self.border}  # final_labels
+        final_labels['instances'].clip(imgsz, imgsz)
+        # `good` is used to index cls; presumably a keep mask/index over instances - confirm in Instances
+        good = final_labels['instances'].remove_zero_area_boxes()
+        final_labels['cls'] = final_labels['cls'][good]
+        return final_labels
+
+
+class MixUp(BaseMixTransform):
+    """Blend the current sample with one randomly chosen sample (MixUp, https://arxiv.org/pdf/1710.09412.pdf)."""
+
+    def __init__(self, dataset, pre_transform=None, p=0.0) -> None:
+        """Forward the dataset, optional pre-transform and apply probability to BaseMixTransform."""
+        super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
+
+    def get_indexes(self):
+        """Pick one dataset index uniformly at random."""
+        last = len(self.dataset) - 1
+        return random.randint(0, last)
+
+    def _mix_transform(self, labels):
+        """Weighted-average the two images and concatenate their instances and classes."""
+        ratio = np.random.beta(32.0, 32.0)  # mixup ratio, alpha=beta=32.0
+        partner = labels['mix_labels'][0]
+        blended = labels['img'] * ratio + partner['img'] * (1 - ratio)
+        labels['img'] = blended.astype(np.uint8)
+        labels['instances'] = Instances.concatenate([labels['instances'], partner['instances']], axis=0)
+        labels['cls'] = np.concatenate([labels['cls'], partner['cls']], 0)
+        return labels
+
+
+class RandomPerspective:
+    """
+    Random rotation / scale / shear / translation / perspective warp of an image and its targets.
+
+    The same 3x3 matrix is applied to the image, bounding boxes, segments and keypoints; boxes that
+    become too small, too thin or lose too much area are filtered out afterwards.
+    """
+
+    def __init__(self,
+                 degrees=0.0,
+                 translate=0.1,
+                 scale=0.5,
+                 shear=0.0,
+                 perspective=0.0,
+                 border=(0, 0),
+                 pre_transform=None):
+        """
+        Store the augmentation ranges.
+
+        Args:
+            degrees (float): rotation angle is sampled uniformly from [-degrees, degrees].
+            translate (float): translation is sampled from [0.5 - translate, 0.5 + translate] times the output size.
+            scale (float): scale factor is sampled uniformly from [1 - scale, 1 + scale].
+            shear (float): x and y shear angles (degrees) are sampled from [-shear, shear].
+            perspective (float): perspective coefficients are sampled from [-perspective, perspective].
+            border (tuple): border used when the labels carry no 'mosaic_border' entry.
+            pre_transform (callable | None): applied to the labels first, only when 'mosaic_border' is absent.
+        """
+        self.degrees = degrees
+        self.translate = translate
+        self.scale = scale
+        self.shear = shear
+        self.perspective = perspective
+        # Mosaic border
+        self.border = border
+        self.pre_transform = pre_transform
+
+    def affine_transform(self, img, border):
+        """
+        Build a random transform matrix and warp the image with it.
+
+        Requires `self.size` (output width, height), which `__call__` sets before calling this method.
+
+        Args:
+            img (ndarray): input image.
+            border (tuple): border; a non-zero entry forces the warp even for an identity matrix.
+
+        Returns:
+            img (ndarray): warped image (unchanged if no border and the matrix is the identity).
+            M (ndarray): the combined 3x3 matrix.
+            s (float): the sampled scale factor.
+        """
+        # Center: move the image center to the origin
+        C = np.eye(3, dtype=np.float32)
+
+        C[0, 2] = -img.shape[1] / 2  # x translation (pixels)
+        C[1, 2] = -img.shape[0] / 2  # y translation (pixels)
+
+        # Perspective
+        P = np.eye(3, dtype=np.float32)
+        P[2, 0] = random.uniform(-self.perspective, self.perspective)  # x perspective (about y)
+        P[2, 1] = random.uniform(-self.perspective, self.perspective)  # y perspective (about x)
+
+        # Rotation and Scale
+        R = np.eye(3, dtype=np.float32)
+        a = random.uniform(-self.degrees, self.degrees)
+        # a += random.choice([-180, -90, 0, 90])  # add 90deg rotations to small rotations
+        s = random.uniform(1 - self.scale, 1 + self.scale)
+        # s = 2 ** random.uniform(-scale, scale)
+        R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)
+
+        # Shear
+        S = np.eye(3, dtype=np.float32)
+        S[0, 1] = math.tan(random.uniform(-self.shear, self.shear) * math.pi / 180)  # x shear (deg)
+        S[1, 0] = math.tan(random.uniform(-self.shear, self.shear) * math.pi / 180)  # y shear (deg)
+
+        # Translation (note: the local name T shadows the module-level torchvision alias inside this method)
+        T = np.eye(3, dtype=np.float32)
+        T[0, 2] = random.uniform(0.5 - self.translate, 0.5 + self.translate) * self.size[0]  # x translation (pixels)
+        T[1, 2] = random.uniform(0.5 - self.translate, 0.5 + self.translate) * self.size[1]  # y translation (pixels)
+
+        # Combined rotation matrix
+        M = T @ S @ R @ P @ C  # order of operations (right to left) is IMPORTANT
+        # Affine image
+        if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():  # image changed
+            if self.perspective:
+                img = cv2.warpPerspective(img, M, dsize=self.size, borderValue=(114, 114, 114))
+            else:  # affine
+                img = cv2.warpAffine(img, M[:2], dsize=self.size, borderValue=(114, 114, 114))
+        return img, M, s
+
+    def apply_bboxes(self, bboxes, M):
+        """
+        Apply affine to bboxes only.
+
+        All four corners of every box are transformed and the axis-aligned envelope of the result is returned.
+
+        Args:
+            bboxes (ndarray): list of bboxes, xyxy format, with shape (num_bboxes, 4).
+            M (ndarray): affine matrix.
+
+        Returns:
+            new_bboxes (ndarray): bboxes after affine, [num_bboxes, 4].
+        """
+        n = len(bboxes)
+        if n == 0:
+            return bboxes
+
+        xy = np.ones((n * 4, 3), dtype=bboxes.dtype)
+        xy[:, :2] = bboxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2)  # x1y1, x2y2, x1y2, x2y1
+        xy = xy @ M.T  # transform
+        xy = (xy[:, :2] / xy[:, 2:3] if self.perspective else xy[:, :2]).reshape(n, 8)  # perspective rescale or affine
+
+        # Create new boxes
+        x = xy[:, [0, 2, 4, 6]]
+        y = xy[:, [1, 3, 5, 7]]
+        return np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1)), dtype=bboxes.dtype).reshape(4, n).T
+
+    def apply_segments(self, segments, M):
+        """
+        Apply affine to segments and generate new bboxes from segments.
+
+        Args:
+            segments (ndarray): list of segments, [num_samples, 500, 2].
+            M (ndarray): affine matrix.
+
+        Returns:
+            new_bboxes (ndarray): bboxes after affine, [N, 4] (an empty list when there are no segments).
+            new_segments (ndarray): list of segments after affine, [num_samples, 500, 2].
+        """
+        n, num = segments.shape[:2]
+        if n == 0:
+            return [], segments
+
+        xy = np.ones((n * num, 3), dtype=segments.dtype)
+        segments = segments.reshape(-1, 2)
+        xy[:, :2] = segments
+        xy = xy @ M.T  # transform
+        xy = xy[:, :2] / xy[:, 2:3]  # homogeneous divide is applied unconditionally here
+        segments = xy.reshape(n, -1, 2)
+        bboxes = np.stack([segment2box(xy, self.size[0], self.size[1]) for xy in segments], 0)
+        return bboxes, segments
+
+    def apply_keypoints(self, keypoints, M):
+        """
+        Apply affine to keypoints.
+
+        Keypoints that land outside [0, width] x [0, height] of the output get their third channel set to 0.
+
+        Args:
+            keypoints (ndarray): keypoints, [N, 17, 3].
+            M (ndarray): affine matrix.
+
+        Return:
+            new_keypoints (ndarray): keypoints after affine, [N, 17, 3].
+        """
+        n, nkpt = keypoints.shape[:2]
+        if n == 0:
+            return keypoints
+        xy = np.ones((n * nkpt, 3), dtype=keypoints.dtype)
+        visible = keypoints[..., 2].reshape(n * nkpt, 1)
+        xy[:, :2] = keypoints[..., :2].reshape(n * nkpt, 2)
+        xy = xy @ M.T  # transform
+        xy = xy[:, :2] / xy[:, 2:3]  # perspective rescale or affine
+        out_mask = (xy[:, 0] < 0) | (xy[:, 1] < 0) | (xy[:, 0] > self.size[0]) | (xy[:, 1] > self.size[1])
+        visible[out_mask] = 0
+        return np.concatenate([xy, visible], axis=-1).reshape(n, nkpt, 3)
+
+    def __call__(self, labels):
+        """
+        Affine images and targets.
+
+        Pops 'instances', 'ratio_pad' and 'mosaic_border' from `labels`, sets `self.size` to the output
+        (width, height), warps everything and writes back 'instances', 'cls', 'img' and 'resized_shape'.
+
+        Args:
+            labels (dict): a dict of `bboxes`, `segments`, `keypoints`.
+
+        Returns:
+            labels (dict): the updated dict, keeping only instances that pass `box_candidates`.
+        """
+        if self.pre_transform and 'mosaic_border' not in labels:
+            labels = self.pre_transform(labels)
+        labels.pop('ratio_pad', None)  # do not need ratio pad
+
+        img = labels['img']
+        cls = labels['cls']
+        instances = labels.pop('instances')
+        # Make sure the coord formats are right
+        instances.convert_bbox(format='xyxy')
+        instances.denormalize(*img.shape[:2][::-1])
+
+        border = labels.pop('mosaic_border', self.border)
+        # A negative (mosaic) border shrinks the output relative to the input image
+        self.size = img.shape[1] + border[1] * 2, img.shape[0] + border[0] * 2  # w, h
+        # M is affine matrix
+        # scale for func:`box_candidates`
+        img, M, scale = self.affine_transform(img, border)
+
+        bboxes = self.apply_bboxes(instances.bboxes, M)
+
+        segments = instances.segments
+        keypoints = instances.keypoints
+        # Update bboxes if there are segments.
+        if len(segments):
+            bboxes, segments = self.apply_segments(segments, M)
+
+        if keypoints is not None:
+            keypoints = self.apply_keypoints(keypoints, M)
+        new_instances = Instances(bboxes, segments, keypoints, bbox_format='xyxy', normalized=False)
+        # Clip
+        new_instances.clip(*self.size)
+
+        # Filter instances
+        instances.scale(scale_w=scale, scale_h=scale, bbox_only=True)
+        # Make the bboxes have the same scale with new_bboxes
+        i = self.box_candidates(box1=instances.bboxes.T,
+                                box2=new_instances.bboxes.T,
+                                area_thr=0.01 if len(segments) else 0.10)
+        labels['instances'] = new_instances[i]
+        labels['cls'] = cls[i]
+        labels['img'] = img
+        labels['resized_shape'] = img.shape[:2]
+        return labels
+
+    def box_candidates(self, box1, box2, wh_thr=2, ar_thr=100, area_thr=0.1, eps=1e-16):  # box1(4,n), box2(4,n)
+        """
+        Compute a boolean mask of boxes worth keeping after augmentation.
+
+        Args:
+            box1 (ndarray): boxes before augment, shape (4, n), xyxy rows.
+            box2 (ndarray): boxes after augment, shape (4, n), xyxy rows.
+            wh_thr (float): minimum width and height (pixels) of the augmented box.
+            ar_thr (float): maximum aspect ratio of the augmented box.
+            area_thr (float): minimum ratio of augmented area to original area.
+            eps (float): small constant guarding the divisions.
+
+        Returns:
+            (ndarray): boolean array of length n, True for boxes passing all four tests.
+        """
+        # Compute box candidates: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio
+        w1, h1 = box1[2] - box1[0], box1[3] - box1[1]
+        w2, h2 = box2[2] - box2[0], box2[3] - box2[1]
+        ar = np.maximum(w2 / (h2 + eps), h2 / (w2 + eps))  # aspect ratio
+        return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + eps) > area_thr) & (ar < ar_thr)  # candidates
+
+
+class RandomHSV:
+    """Randomly jitter the hue, saturation and value channels of an image, in place."""
+
+    def __init__(self, hgain=0.5, sgain=0.5, vgain=0.5) -> None:
+        """Store the maximum relative gain for hue, saturation and value."""
+        self.hgain, self.sgain, self.vgain = hgain, sgain, vgain
+
+    def __call__(self, labels):
+        """
+        Scale the H, S and V channels of labels['img'] by random factors in [1 - gain, 1 + gain].
+
+        The image array is modified in place (the converted result is written back into the same buffer);
+        `labels` is returned unchanged when all three gains are falsy.
+        """
+        img = labels['img']
+        if not (self.hgain or self.sgain or self.vgain):
+            return labels
+
+        gains = np.random.uniform(-1, 1, 3) * [self.hgain, self.sgain, self.vgain] + 1  # random gains
+        hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))
+        dtype = img.dtype  # uint8
+
+        # Per-channel lookup tables: hue wraps at 180, saturation/value saturate at 255
+        levels = np.arange(0, 256, dtype=gains.dtype)
+        lut_hue = ((levels * gains[0]) % 180).astype(dtype)
+        lut_sat = np.clip(levels * gains[1], 0, 255).astype(dtype)
+        lut_val = np.clip(levels * gains[2], 0, 255).astype(dtype)
+
+        remapped = (cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))
+        im_hsv = cv2.merge(remapped)
+        cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR, dst=img)  # no return needed
+        return labels
+
+
+class RandomFlip:
+    """Flip an image and its instances horizontally or vertically with probability `p`."""
+
+    def __init__(self, p=0.5, direction='horizontal', flip_idx=None) -> None:
+        """
+        Store the flip settings.
+
+        Args:
+            p (float): flip probability, must lie in [0, 1].
+            direction (str): 'horizontal' or 'vertical'.
+            flip_idx (list | None): keypoint index permutation applied after a horizontal flip.
+        """
+        assert direction in ['horizontal', 'vertical'], f'Support direction `horizontal` or `vertical`, got {direction}'
+        assert 0 <= p <= 1.0
+
+        self.p, self.direction, self.flip_idx = p, direction, flip_idx
+
+    def __call__(self, labels):
+        """Randomly flip labels['img'] and labels['instances']; boxes are converted to xywh first."""
+        img = labels['img']
+        instances = labels.pop('instances')
+        instances.convert_bbox(format='xywh')
+        h, w = img.shape[:2]
+        # Normalized coordinates are mirrored around 1 instead of the pixel size
+        if instances.normalized:
+            h, w = 1, 1
+
+        # Flip up-down
+        if self.direction == 'vertical':
+            if random.random() < self.p:
+                img = np.flipud(img)
+                instances.flipud(h)
+        # Flip left-right
+        if self.direction == 'horizontal':
+            if random.random() < self.p:
+                img = np.fliplr(img)
+                instances.fliplr(w)
+                # For keypoints: reorder them with the configured flip permutation
+                if self.flip_idx is not None and instances.keypoints is not None:
+                    instances.keypoints = np.ascontiguousarray(instances.keypoints[:, self.flip_idx, :])
+
+        labels['img'] = np.ascontiguousarray(img)
+        labels['instances'] = instances
+        return labels
+
+
+class LetterBox:
+    """Aspect-preserving resize followed by constant (114) padding, for detection, segmentation and pose."""
+
+    def __init__(self, new_shape=(640, 640), auto=False, scaleFill=False, scaleup=True, center=True, stride=32):
+        """
+        Store the letterbox settings.
+
+        Args:
+            new_shape (tuple | int): target (height, width); an int means a square target.
+            auto (bool): pad only up to the next multiple of `stride` (minimum rectangle).
+            scaleFill (bool): stretch to the target without padding (ignored when `auto` is set).
+            scaleup (bool): allow enlarging; when False the image is only ever shrunk.
+            center (bool): split the padding on both sides; otherwise pad bottom/right only.
+            stride (int): stride used by `auto`.
+        """
+        self.new_shape, self.stride = new_shape, stride
+        self.auto, self.scaleFill, self.scaleup = auto, scaleFill, scaleup
+        self.center = center  # Put the image in the middle or top-left
+
+    def __call__(self, labels=None, image=None):
+        """
+        Letterbox an image.
+
+        The image comes from `image` if given, else from labels['img']. A 'rect_shape' entry in `labels`
+        (popped) overrides the configured target shape. Returns the updated `labels` dict when it is
+        non-empty, otherwise just the padded image.
+        """
+        labels = {} if labels is None else labels
+        img = labels.get('img') if image is None else image
+        src_hw = img.shape[:2]  # current shape [height, width]
+        target = labels.pop('rect_shape', self.new_shape)
+        if isinstance(target, int):
+            target = (target, target)
+
+        # Scale ratio (new / old)
+        gain = min(target[0] / src_hw[0], target[1] / src_hw[1])
+        if not self.scaleup:  # only scale down, do not scale up (for better val mAP)
+            gain = min(gain, 1.0)
+
+        # Compute padding
+        ratio = (gain, gain)  # width, height ratios
+        new_unpad = (int(round(src_hw[1] * gain)), int(round(src_hw[0] * gain)))
+        dw = target[1] - new_unpad[0]  # total width padding
+        dh = target[0] - new_unpad[1]  # total height padding
+        if self.auto:  # minimum rectangle
+            dw, dh = np.mod(dw, self.stride), np.mod(dh, self.stride)
+        elif self.scaleFill:  # stretch
+            dw, dh = 0.0, 0.0
+            new_unpad = (target[1], target[0])
+            ratio = (target[1] / src_hw[1], target[0] / src_hw[0])  # width, height ratios
+
+        if self.center:
+            dw, dh = dw / 2, dh / 2  # divide padding into 2 sides
+        if labels.get('ratio_pad'):
+            labels['ratio_pad'] = (labels['ratio_pad'], (dw, dh))  # for evaluation
+
+        if src_hw[::-1] != new_unpad:  # resize
+            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
+
+        # The -0.1 / +0.1 nudges send an odd pixel of padding to the bottom/right side
+        if self.center:
+            top, left = int(round(dh - 0.1)), int(round(dw - 0.1))
+        else:
+            top, left = 0, 0
+        bottom, right = int(round(dh + 0.1)), int(round(dw + 0.1))
+        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT,
+                                 value=(114, 114, 114))  # add border
+
+        if not len(labels):
+            return img
+        # labels['img'] still holds the un-resized image here, which _update_labels relies on
+        labels = self._update_labels(labels, ratio, dw, dh)
+        labels['img'] = img
+        labels['resized_shape'] = target
+        return labels
+
+    def _update_labels(self, labels, ratio, padw, padh):
+        """Convert instances to pixel xyxy (using the current labels['img'] size), then scale and pad them."""
+        instances = labels['instances']
+        img_h, img_w = labels['img'].shape[:2]
+        instances.convert_bbox(format='xyxy')
+        instances.denormalize(img_w, img_h)
+        instances.scale(*ratio)
+        instances.add_padding(padw, padh)
+        return labels
+
+
+class CopyPaste:
+    """
+    Copy-Paste augmentation (https://arxiv.org/abs/2012.07177) for samples that carry segment polygons.
+
+    Selected instances are mirrored left-right and pasted back into the same image, and their mirrored
+    copies are appended to the instance list.
+
+    Attributes:
+        p (float): fraction of the eligible instances to paste; a falsy value disables the transform.
+    """
+
+    def __init__(self, p=0.5) -> None:
+        """Store the paste fraction `p`."""
+        self.p = p
+
+    def __call__(self, labels):
+        """
+        Paste mirrored copies of some instances into labels['img'] (modified in place).
+
+        Instances are converted to pixel xyxy format. An instance is eligible when its mirrored box
+        overlaps every existing box by less than 30% (intersection over area); round(p * eligible) of
+        them are sampled at random.
+
+        Args:
+            labels (dict): must contain 'img', 'cls' and 'instances'.
+
+        Returns:
+            labels (dict): same dict with 'img', 'cls' and 'instances' updated.
+        """
+        im = labels['img']
+        cls = labels['cls']
+        h, w = im.shape[:2]
+        instances = labels.pop('instances')
+        instances.convert_bbox(format='xyxy')
+        instances.denormalize(w, h)
+        if self.p and len(instances.segments):
+            im_new = np.zeros(im.shape, np.uint8)  # paste mask, drawn in un-flipped coordinates
+
+            # Calculate ioa first then select indexes randomly
+            ins_flip = deepcopy(instances)
+            ins_flip.fliplr(w)
+
+            ioa = bbox_ioa(ins_flip.bboxes, instances.bboxes)  # intersection over area, (N, M)
+            indexes = np.nonzero((ioa < 0.30).all(1))[0]  # (N, )
+            n = len(indexes)
+            for j in random.sample(list(indexes), k=round(self.p * n)):
+                cls = np.concatenate((cls, cls[[j]]), axis=0)
+                # New instances are appended, so index j keeps pointing at the original (un-flipped) one
+                instances = Instances.concatenate((instances, ins_flip[[j]]), axis=0)
+                cv2.drawContours(im_new, instances.segments[[j]].astype(np.int32), -1, (1, 1, 1), cv2.FILLED)
+
+            result = cv2.flip(im, 1)  # augment segments (flip left-right)
+            i = cv2.flip(im_new, 1).astype(bool)  # mirror the mask so it lines up with the mirrored image
+            im[i] = result[i]  # cv2.imwrite('debug.jpg', im)  # debug
+
+        labels['img'] = im
+        labels['cls'] = cls
+        labels['instances'] = instances
+        return labels
+
+
+class Albumentations:
+    """Optional Albumentations pipeline for YOLOv8; a no-op when the package is missing or fails to set up."""
+
+    def __init__(self, p=1.0):
+        """
+        Build the Albumentations pipeline for YOLO-format boxes.
+
+        `self.transform` stays None when albumentations cannot be imported (silently) or when setup raises
+        any other exception (logged).
+        """
+        self.p = p
+        self.transform = None
+        prefix = colorstr('albumentations: ')
+        try:
+            import albumentations as A
+
+            check_version(A.__version__, '1.0.3', hard=True)  # version requirement
+
+            pipeline = [
+                A.Blur(p=0.01),
+                A.MedianBlur(p=0.01),
+                A.ToGray(p=0.01),
+                A.CLAHE(p=0.01),
+                A.RandomBrightnessContrast(p=0.0),
+                A.RandomGamma(p=0.0),
+                A.ImageCompression(quality_lower=75, p=0.0)]  # transforms
+            box_params = A.BboxParams(format='yolo', label_fields=['class_labels'])
+            self.transform = A.Compose(pipeline, bbox_params=box_params)
+
+            # Log only the transforms that can actually fire (p > 0)
+            active = (f'{x}'.replace('always_apply=False, ', '') for x in pipeline if x.p)
+            LOGGER.info(prefix + ', '.join(active))
+        except ImportError:  # package not installed, skip
+            pass
+        except Exception as e:
+            LOGGER.info(f'{prefix}{e}')
+
+    def __call__(self, labels):
+        """
+        Apply the pipeline to the image and boxes with probability `p`.
+
+        Samples without any class labels are returned untouched. Otherwise the boxes are converted to
+        normalized xywh; the image, classes and boxes are replaced by the transformed ones only when at
+        least one box survives the transform.
+        """
+        im = labels['img']
+        cls = labels['cls']
+        if not len(cls):
+            return labels
+
+        instances = labels['instances']
+        instances.convert_bbox('xywh')
+        instances.normalize(*im.shape[:2][::-1])
+        bboxes = instances.bboxes
+        # TODO: add supports of segments and keypoints
+        if self.transform and random.random() < self.p:
+            new = self.transform(image=im, bboxes=bboxes, class_labels=cls)  # transformed
+            if len(new['class_labels']) > 0:  # skip update if no bbox in new im
+                labels['img'] = new['image']
+                labels['cls'] = np.array(new['class_labels'])
+                bboxes = np.array(new['bboxes'], dtype=np.float32)
+        instances.update(bboxes=bboxes)
+        return labels
+
+
+# TODO: technically this is not an augmentation; maybe we should move it to another file
+class Format:
+
+    def __init__(self,
+                 bbox_format='xywh',
+                 normalize=True,
+                 return_mask=False,
+                 return_keypoint=False,
+                 mask_ratio=4,
+                 mask_overlap=True,
+                 batch_idx=True):
+        """
+        Store the output-formatting options.
+
+        Args:
+            bbox_format (str): box format the instances are converted to.
+            normalize (bool): normalize the instances by the image size before output.
+            return_mask (bool): also emit 'masks'; set False when training detection only.
+            return_keypoint (bool): also emit 'keypoints'.
+            mask_ratio (int): divisor of the image size used for the empty-mask shape.
+            mask_overlap (bool): emit a single mask channel (instead of one per instance) when there are no labels.
+            batch_idx (bool): emit a 'batch_idx' tensor, i.e. keep the batch indexes.
+        """
+        # Box handling
+        self.bbox_format, self.normalize = bbox_format, normalize
+        # Optional outputs
+        self.return_mask, self.return_keypoint = return_mask, return_keypoint
+        # Mask options
+        self.mask_ratio, self.mask_overlap = mask_ratio, mask_overlap
+        self.batch_idx = batch_idx  # keep the batch indexes
+
+    def __call__(self, labels):
+        """Return formatted image, classes, bounding boxes & keypoints to be used by 'collate_fn'."""
+        img = labels.pop('img')
+        h, w = img.shape[:2]
+        cls = labels.pop('cls')
+        instances = labels.pop('instances')
+        instances.convert_bbox(format=self.bbox_format)
+        instances.denormalize(w, h)
+        nl = len(instances)
+
+        if self.return_mask:
+            if nl:
+                masks, instances, cls = self._format_segments(instances, cls, w, h)
+                masks = torch.from_numpy(masks)
+            else:
+                masks = torch.zeros(1 if self.mask_overlap else nl, img.shape[0] // self.mask_ratio,
+                                    img.shape[1] // self.mask_ratio)
+            labels['masks'] = masks
+        if self.normalize:
+            instances.normalize(w, h)
+        labels['img'] = self._format_img(img)
+        labels['cls'] = torch.from_numpy(cls) if nl else torch.zeros(nl)
+        labels['bboxes'] = torch.from_numpy(instances.bboxes) if nl else torch.zeros((nl, 4))
+        if self.return_keypoint:
+            labels['keypoints'] = torch.from_numpy(instances.keypoints)
+        # Then we can use collate_fn
+        if self.batch_idx:
+            labels['batch_idx'] = torch.zeros(nl)
+        return labels
+
+    def _format_img(self, img):
+        """Format the image for YOLOv5 from Numpy array to PyTorch tensor."""
+        if len(img.shape) < 3:
+            img = np.expand_dims(img, -1)
+        img = np.ascontiguousarray(img.transpose(2, 0, 1)[::-1])
+        img = torch.from_numpy(img)
+        return img
+
+    def _format_segments(self, instances, cls, w, h):
+        """convert polygon points to bitmap."""
+        segments = instances.segments
+        if self.mask_overlap:
+            masks, sorted_idx = polygons2masks_overlap((h, w), segments, downsample_ratio=self.mask_ratio)
+            masks = masks[None]  # (640, 640) -> (1, 640, 640)
+            instances = instances[sorted_idx]
+            cls = cls[sorted_idx]
+        else:
+            masks = polygons2masks((h, w), segments, color=1, downsample_ratio=self.mask_ratio)
+
+        return masks, instances, cls
+
+
+def v8_transforms(dataset, imgsz, hyp, stretch=False):
+    """
+    Assemble the composed YOLOv8 training augmentation pipeline.
+
+    Args:
+        dataset: Dataset exposing `data` (dict) and `use_keypoints`; also handed to Mosaic and MixUp.
+        imgsz (int): Image size used by Mosaic and, unless `stretch` is set, by the LetterBox pre-transform.
+        hyp: Hyperparameter namespace (mosaic, copy_paste, degrees, translate, scale, shear, perspective, mixup,
+            hsv_h, hsv_s, hsv_v, flipud, fliplr). `hyp.fliplr` is reset to 0.0 in place for keypoint datasets
+            that define no 'flip_idx'.
+        stretch (bool): If True, RandomPerspective gets no LetterBox pre-transform. Defaults to False.
+
+    Returns:
+        (Compose): The full transform pipeline.
+
+    Raises:
+        ValueError: If 'flip_idx' is defined but its length differs from kpt_shape[0].
+    """
+    geometric = [
+        Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic),
+        CopyPaste(p=hyp.copy_paste),
+        RandomPerspective(
+            degrees=hyp.degrees,
+            translate=hyp.translate,
+            scale=hyp.scale,
+            shear=hyp.shear,
+            perspective=hyp.perspective,
+            pre_transform=None if stretch else LetterBox(new_shape=(imgsz, imgsz)),
+        )]
+    pre_transform = Compose(geometric)
+
+    flip_idx = dataset.data.get('flip_idx', [])  # for keypoints augmentation
+    if dataset.use_keypoints:
+        kpt_shape = dataset.data.get('kpt_shape', None)
+        if len(flip_idx) == 0 and hyp.fliplr > 0.0:
+            # Horizontal flips need a keypoint index mapping; disable them when none is given
+            hyp.fliplr = 0.0
+            LOGGER.warning("WARNING ⚠️ No 'flip_idx' array defined in data.yaml, setting augmentation 'fliplr=0.0'")
+        elif flip_idx and (len(flip_idx) != kpt_shape[0]):
+            raise ValueError(f'data.yaml flip_idx={flip_idx} length must be equal to kpt_shape[0]={kpt_shape[0]}')
+
+    stages = [
+        pre_transform,
+        MixUp(dataset, pre_transform=pre_transform, p=hyp.mixup),
+        Albumentations(p=1.0),
+        RandomHSV(hgain=hyp.hsv_h, sgain=hyp.hsv_s, vgain=hyp.hsv_v),
+        RandomFlip(direction='vertical', p=hyp.flipud),
+        RandomFlip(direction='horizontal', p=hyp.fliplr, flip_idx=flip_idx)]
+    return Compose(stages)  # transforms
+
+
+# Classification augmentations -----------------------------------------------------------------------------------------
+def classify_transforms(size=224, mean=(0.0, 0.0, 0.0), std=(1.0, 1.0, 1.0)):  # IMAGENET_MEAN, IMAGENET_STD
+    """
+    Build the classification transforms to apply if albumentations is not installed.
+
+    Args:
+        size (int): Side length handed to CenterCrop. Must be an int.
+        mean (tuple): Per-channel mean for normalization.
+        std (tuple): Per-channel std for normalization.
+
+    Returns:
+        A `T.Compose` of CenterCrop and ToTensor, followed by `T.Normalize` when any mean or std entry is
+        non-zero (`T` is the module-level transforms namespace, presumably torchvision.transforms - confirm
+        against the file's imports).
+
+    Raises:
+        TypeError: If `size` is not an int.
+    """
+    if not isinstance(size, int):
+        raise TypeError(f'classify_transforms() size {size} must be integer, not (list, tuple)')
+    steps = [CenterCrop(size), ToTensor()]
+    if any(mean) or any(std):
+        steps.append(T.Normalize(mean, std, inplace=True))
+    return T.Compose(steps)
+
+
+def hsv2colorjitter(h, s, v):
+    """Map HSV jitter (hue, saturation, value) to ColorJitter order (brightness, contrast, saturation, hue)."""
+    brightness = contrast = v  # value drives both brightness and contrast
+    return brightness, contrast, s, h
+
+
+def classify_albumentations(
+        augment=True,
+        size=224,
+        scale=(0.08, 1.0),
+        hflip=0.5,
+        vflip=0.0,
+        hsv_h=0.015,  # image HSV-Hue augmentation (fraction)
+        hsv_s=0.7,  # image HSV-Saturation augmentation (fraction)
+        hsv_v=0.4,  # image HSV-Value augmentation (fraction)
+        mean=(0.0, 0.0, 0.0),  # IMAGENET_MEAN
+        std=(1.0, 1.0, 1.0),  # IMAGENET_STD
+        auto_aug=False,
+):
+    """
+    Build the YOLOv8 classification Albumentations pipeline (optional, only used if the package is installed).
+
+    With `augment` set, the pipeline is a random resized crop plus optional flips and color jitter; otherwise it
+    is a fixed resize and center crop for the eval set. Normalize and ToTensorV2 are always appended.
+
+    Returns:
+        An `A.Compose` pipeline, or None when albumentations cannot be imported or building the pipeline raises
+        (the error is logged in the latter case).
+    """
+    prefix = colorstr('albumentations: ')
+    try:
+        import albumentations as A
+        from albumentations.pytorch import ToTensorV2
+
+        check_version(A.__version__, '1.0.3', hard=True)  # version requirement
+        if augment:  # Resize and crop
+            pipeline = [A.RandomResizedCrop(height=size, width=size, scale=scale)]
+            if auto_aug:
+                # TODO: implement AugMix, AutoAug & RandAug in albumentations
+                LOGGER.info(f'{prefix}auto augmentations are currently not supported')
+            else:
+                if hflip > 0:
+                    pipeline.append(A.HorizontalFlip(p=hflip))
+                if vflip > 0:
+                    pipeline.append(A.VerticalFlip(p=vflip))
+                if any((hsv_h, hsv_s, hsv_v)):
+                    jitter_args = hsv2colorjitter(hsv_h, hsv_s, hsv_v)  # brightness, contrast, saturation, hue
+                    pipeline.append(A.ColorJitter(*jitter_args))
+        else:  # Use fixed crop for eval set (reproducibility)
+            pipeline = [A.SmallestMaxSize(max_size=size), A.CenterCrop(height=size, width=size)]
+        pipeline.extend([A.Normalize(mean=mean, std=std), ToTensorV2()])  # Normalize and convert to Tensor
+        described = [f'{step}'.replace('always_apply=False, ', '') for step in pipeline if step.p]
+        LOGGER.info(prefix + ', '.join(described))
+        return A.Compose(pipeline)
+
+    except ImportError:  # package not installed, skip
+        pass
+    except Exception as e:
+        LOGGER.info(f'{prefix}{e}')
+    return None
+
+
+class ClassifyLetterBox:
+    """
+    YOLOv8 LetterBox class for image preprocessing, i.e. T.Compose([LetterBox(size), ToTensor()]).
+
+    The image is resized to fit inside the target shape while keeping its aspect ratio, then centered on a
+    canvas filled with the value 114.
+
+    Args:
+        size (int | tuple): Target (height, width); an int is used for both. Defaults to (640, 640).
+        auto (bool): If True, the canvas is the resized image rounded up to a multiple of `stride` on each
+            side instead of the full target shape. Defaults to False.
+        stride (int): Stride used when `auto` is True. Defaults to 32.
+    """
+
+    def __init__(self, size=(640, 640), auto=False, stride=32):
+        """Store the target height/width and the auto/stride options."""
+        super().__init__()
+        self.h, self.w = (size, size) if isinstance(size, int) else size
+        self.auto = auto  # pass max size integer, automatically solve for short side using stride
+        self.stride = stride  # used with auto
+
+    def __call__(self, im):  # im = np.array HWC
+        """Return the letterboxed copy of HWC image `im`."""
+        imh, imw = im.shape[:2]
+        r = min(self.h / imh, self.w / imw)  # ratio of new/old
+        h, w = round(imh * r), round(imw * r)  # resized image
+        # The branches are separate statements on purpose: written as `gen if auto else self.h, self.w`
+        # the conditional binds tighter than the comma, so hs became a generator when auto=True.
+        if self.auto:
+            hs, ws = (math.ceil(x / self.stride) * self.stride for x in (h, w))
+        else:
+            hs, ws = self.h, self.w
+        top, left = round((hs - h) / 2 - 0.1), round((ws - w) / 2 - 0.1)
+        im_out = np.full((hs, ws, 3), 114, dtype=im.dtype)  # canvas matches the shape the offsets refer to
+        im_out[top:top + h, left:left + w] = cv2.resize(im, (w, h), interpolation=cv2.INTER_LINEAR)
+        return im_out
+
+
+class CenterCrop:
+    """YOLOv8 CenterCrop class for image preprocessing, i.e. T.Compose([CenterCrop(size), ToTensor()])"""
+
+    def __init__(self, size=640):
+        """Store the output (height, width); an int `size` is used for both sides."""
+        super().__init__()
+        if isinstance(size, int):
+            size = (size, size)
+        self.h, self.w = size
+
+    def __call__(self, im):  # im = np.array HWC
+        """Crop the largest centered square from `im` and resize it to (self.h, self.w)."""
+        height, width = im.shape[:2]
+        side = min(height, width)  # min dimension
+        y0 = (height - side) // 2
+        x0 = (width - side) // 2
+        square = im[y0:y0 + side, x0:x0 + side]
+        return cv2.resize(square, (self.w, self.h), interpolation=cv2.INTER_LINEAR)
+
+
+class ToTensor:
+    """YOLOv8 ToTensor class for image preprocessing, i.e. T.Compose([LetterBox(size), ToTensor()])."""
+
+    def __init__(self, half=False):
+        """Initialize YOLOv8 ToTensor object with optional half-precision support."""
+        super().__init__()
+        self.half = half
+
+    def __call__(self, im):  # im = np.array HWC in BGR order
+        """Convert an HWC BGR numpy image into a CHW RGB tensor scaled to 0.0-1.0 (fp16 if `half`, else fp32)."""
+        chw_rgb = im.transpose((2, 0, 1))[::-1]  # HWC to CHW -> BGR to RGB
+        tensor = torch.from_numpy(np.ascontiguousarray(chw_rgb))  # contiguous -> to torch
+        if self.half:
+            tensor = tensor.half()  # uint8 to fp16
+        else:
+            tensor = tensor.float()  # uint8 to fp32
+        return tensor.div_(255.0)  # 0-255 to 0.0-1.0
--- a/ultralytics/data/base.py
+++ b/ultralytics/data/base.py
@ -0,0 +1,287 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+import glob
+import math
+import os
+import random
+from copy import deepcopy
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+from typing import Optional
+
+import cv2
+import numpy as np
+import psutil
+from torch.utils.data import Dataset
+from tqdm import tqdm
+
+from ultralytics.utils import DEFAULT_CFG, LOCAL_RANK, LOGGER, NUM_THREADS, TQDM_BAR_FORMAT
+
+from .utils import HELP_URL, IMG_FORMATS
+
+
+class BaseDataset(Dataset):
+    """
+    Base dataset class for loading and processing image data.
+
+    Args:
+        img_path (str | list): Image folder, text file listing image paths, or a list of those.
+        imgsz (int, optional): Image size (longest side after resizing). Defaults to 640.
+        cache (bool | str, optional): Cache images; 'disk' writes *.npy files, any other truthy value caches to
+            RAM ('ram' additionally checks available memory first). Defaults to False.
+        augment (bool, optional): If True, data augmentation is applied. Defaults to True.
+        hyp (optional): Hyperparameters forwarded to build_transforms(). Defaults to DEFAULT_CFG.
+        prefix (str, optional): Prefix to print in log messages. Defaults to ''.
+        rect (bool, optional): If True, rectangular training is used. Defaults to False.
+        batch_size (int, optional): Size of batches. Defaults to 16.
+        stride (int, optional): Stride. Defaults to 32.
+        pad (float, optional): Padding. Defaults to 0.5.
+        single_cls (bool, optional): If True, single class training is used. Defaults to False.
+        classes (list): List of included classes. Default is None.
+        fraction (float): Fraction of dataset to utilize. Default is 1.0 (use all data).
+
+    Attributes:
+        im_files (list): List of image file paths.
+        labels (list): List of label data dictionaries.
+        ni (int): Number of images in the dataset.
+        ims (list): List of loaded images.
+        npy_files (list): List of numpy file paths.
+        transforms (callable): Image transformation function.
+    """
+
+    def __init__(self,
+                 img_path,
+                 imgsz=640,
+                 cache=False,
+                 augment=True,
+                 hyp=DEFAULT_CFG,
+                 prefix='',
+                 rect=False,
+                 batch_size=16,
+                 stride=32,
+                 pad=0.5,
+                 single_cls=False,
+                 classes=None,
+                 fraction=1.0):
+        """Collect image files and labels, then set up rectangular batches, caching and transforms."""
+        super().__init__()
+        self.img_path = img_path
+        self.imgsz = imgsz
+        self.augment = augment
+        self.single_cls = single_cls
+        self.prefix = prefix
+        self.fraction = fraction
+        self.im_files = self.get_img_files(self.img_path)
+        self.labels = self.get_labels()
+        self.update_labels(include_class=classes)  # single_cls and include_class
+        self.ni = len(self.labels)  # number of images
+        self.rect = rect
+        self.batch_size = batch_size
+        self.stride = stride
+        self.pad = pad
+        if self.rect:
+            assert self.batch_size is not None
+            self.set_rectangle()
+
+        # Buffer thread for mosaic images
+        self.buffer = []  # buffer size = batch size
+        self.max_buffer_length = min((self.ni, self.batch_size * 8, 1000)) if self.augment else 0
+
+        # Cache stuff
+        if cache == 'ram' and not self.check_cache_ram():
+            cache = False
+        self.ims, self.im_hw0, self.im_hw = [None] * self.ni, [None] * self.ni, [None] * self.ni
+        self.npy_files = [Path(f).with_suffix('.npy') for f in self.im_files]
+        if cache:
+            self.cache_images(cache)
+
+        # Transforms
+        self.transforms = self.build_transforms(hyp=hyp)
+
+    def get_img_files(self, img_path):
+        """
+        Collect the sorted list of image files found under `img_path`.
+
+        Directories are searched recursively; a regular file is read as a list of image paths, one per line,
+        where a leading './' is resolved against the list file's directory. Only files whose extension is in
+        IMG_FORMATS are kept, and the list is truncated to `self.fraction` of its length.
+
+        Raises:
+            FileNotFoundError: If a path does not exist, no image is found, or reading fails.
+        """
+        try:
+            f = []  # image files
+            for p in img_path if isinstance(img_path, list) else [img_path]:
+                p = Path(p)  # os-agnostic
+                if p.is_dir():  # dir
+                    f += glob.glob(str(p / '**' / '*.*'), recursive=True)
+                elif p.is_file():  # file
+                    with open(p) as t:
+                        lines = t.read().strip().splitlines()
+                    parent = str(p.parent) + os.sep
+                    # Swap only the leading './' for the parent dir (local to global path). str.replace() would
+                    # also rewrite later occurrences, e.g. the './' inside '../'.
+                    f += [parent + x[2:] if x.startswith('./') else x for x in lines]
+                else:
+                    raise FileNotFoundError(f'{self.prefix}{p} does not exist')
+            im_files = sorted(x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in IMG_FORMATS)
+            assert im_files, f'{self.prefix}No images found'
+        except Exception as e:
+            raise FileNotFoundError(f'{self.prefix}Error loading data from {img_path}\n{HELP_URL}') from e
+        if self.fraction < 1:
+            im_files = im_files[:round(len(im_files) * self.fraction)]
+        return im_files
+
+    def update_labels(self, include_class: Optional[list]):
+        """Filter labels in place to `include_class` (optional) and zero the class column if single_cls."""
+        # Build the comparison array once, and only when a class filter was requested
+        include_class_array = None if include_class is None else np.array(include_class).reshape(1, -1)
+        for label in self.labels:
+            if include_class_array is not None:
+                cls = label['cls']
+                bboxes = label['bboxes']
+                segments = label['segments']
+                keypoints = label['keypoints']
+                j = (cls == include_class_array).any(1)  # rows whose class is in include_class
+                label['cls'] = cls[j]
+                label['bboxes'] = bboxes[j]
+                if segments:
+                    label['segments'] = [segments[si] for si, idx in enumerate(j) if idx]
+                if keypoints is not None:
+                    label['keypoints'] = keypoints[j]
+            if self.single_cls:
+                label['cls'][:, 0] = 0
+
+    def load_image(self, i):
+        """Loads 1 image from dataset index 'i', returns (im, original hw, resized hw)."""
+        im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i]
+        if im is None:  # not cached in RAM
+            if fn.exists():  # load npy
+                im = np.load(fn)
+            else:  # read image
+                im = cv2.imread(f)  # BGR
+                if im is None:
+                    raise FileNotFoundError(f'Image Not Found {f}')
+            h0, w0 = im.shape[:2]  # orig hw
+            r = self.imgsz / max(h0, w0)  # ratio
+            if r != 1:  # if sizes are not equal
+                interp = cv2.INTER_LINEAR if (self.augment or r > 1) else cv2.INTER_AREA
+                im = cv2.resize(im, (min(math.ceil(w0 * r), self.imgsz), min(math.ceil(h0 * r), self.imgsz)),
+                                interpolation=interp)
+
+            # Add to buffer if training with augmentations
+            if self.augment:
+                self.ims[i], self.im_hw0[i], self.im_hw[i] = im, (h0, w0), im.shape[:2]  # im, hw_original, hw_resized
+                self.buffer.append(i)
+                if len(self.buffer) >= self.max_buffer_length:
+                    # Evict the oldest buffered image so the buffer stays bounded
+                    j = self.buffer.pop(0)
+                    self.ims[j], self.im_hw0[j], self.im_hw[j] = None, None, None
+
+            return im, (h0, w0), im.shape[:2]
+
+        return self.ims[i], self.im_hw0[i], self.im_hw[i]
+
+    def cache_images(self, cache):
+        """Cache images to memory or disk ('disk' writes *.npy files, anything else keeps images in RAM)."""
+        b, gb = 0, 1 << 30  # bytes of cached images, bytes per gigabytes
+        fcn = self.cache_images_to_disk if cache == 'disk' else self.load_image
+        with ThreadPool(NUM_THREADS) as pool:
+            results = pool.imap(fcn, range(self.ni))
+            pbar = tqdm(enumerate(results), total=self.ni, bar_format=TQDM_BAR_FORMAT, disable=LOCAL_RANK > 0)
+            for i, x in pbar:
+                if cache == 'disk':
+                    b += self.npy_files[i].stat().st_size
+                else:  # 'ram'
+                    self.ims[i], self.im_hw0[i], self.im_hw[i] = x  # im, hw_orig, hw_resized = load_image(self, i)
+                    b += self.ims[i].nbytes
+                pbar.desc = f'{self.prefix}Caching images ({b / gb:.1f}GB {cache})'
+            pbar.close()
+
+    def cache_images_to_disk(self, i):
+        """Saves an image as an *.npy file for faster loading."""
+        f = self.npy_files[i]
+        if not f.exists():
+            np.save(f.as_posix(), cv2.imread(self.im_files[i]))
+
+    def check_cache_ram(self, safety_margin=0.5):
+        """Check image caching requirements vs available memory; returns True if the dataset fits in RAM."""
+        b, gb = 0, 1 << 30  # bytes of cached images, bytes per gigabytes
+        n = min(self.ni, 30)  # extrapolate from 30 random images
+        sampled = 0  # images that could actually be read
+        for _ in range(n):
+            im = cv2.imread(random.choice(self.im_files))  # sample image
+            if im is None:  # unreadable file, leave it out of the estimate instead of crashing
+                continue
+            ratio = self.imgsz / max(im.shape[0], im.shape[1])  # max(h, w)  # ratio
+            b += im.nbytes * ratio ** 2
+            sampled += 1
+        # Bytes required to cache the dataset into RAM; max() guards against an empty sample
+        mem_required = b * self.ni / max(sampled, 1) * (1 + safety_margin)
+        mem = psutil.virtual_memory()
+        cache = mem_required < mem.available  # to cache or not to cache, that is the question
+        if not cache:
+            LOGGER.info(f'{self.prefix}{mem_required / gb:.1f}GB RAM required to cache images '
+                        f'with {int(safety_margin * 100)}% safety margin but only '
+                        f'{mem.available / gb:.1f}/{mem.total / gb:.1f}GB available, '
+                        f'not caching images ⚠️')
+        return cache
+
+    def set_rectangle(self):
+        """Sort images by aspect ratio and compute one stride-aligned image shape per batch (rect training)."""
+        bi = np.floor(np.arange(self.ni) / self.batch_size).astype(int)  # batch index
+        nb = bi[-1] + 1  # number of batches
+
+        s = np.array([x.pop('shape') for x in self.labels])  # hw
+        ar = s[:, 0] / s[:, 1]  # aspect ratio
+        irect = ar.argsort()
+        self.im_files = [self.im_files[i] for i in irect]
+        self.labels = [self.labels[i] for i in irect]
+        ar = ar[irect]
+
+        # Set training image shapes
+        shapes = [[1, 1]] * nb
+        for i in range(nb):
+            ari = ar[bi == i]
+            mini, maxi = ari.min(), ari.max()
+            if maxi < 1:
+                shapes[i] = [maxi, 1]
+            elif mini > 1:
+                shapes[i] = [1, 1 / mini]
+
+        self.batch_shapes = np.ceil(np.array(shapes) * self.imgsz / self.stride + self.pad).astype(int) * self.stride
+        self.batch = bi  # batch index of image
+
+    def __getitem__(self, index):
+        """Returns transformed label information for given index."""
+        return self.transforms(self.get_image_and_label(index))
+
+    def get_image_and_label(self, index):
+        """Return a copy of the label dict for `index` with the loaded image and shape info attached."""
+        label = deepcopy(self.labels[index])  # requires deepcopy() https://github.com/ultralytics/ultralytics/pull/1948
+        label.pop('shape', None)  # shape is for rect, remove it
+        label['img'], label['ori_shape'], label['resized_shape'] = self.load_image(index)
+        label['ratio_pad'] = (label['resized_shape'][0] / label['ori_shape'][0],
+                              label['resized_shape'][1] / label['ori_shape'][1])  # for evaluation
+        if self.rect:
+            label['rect_shape'] = self.batch_shapes[self.batch[index]]
+        return self.update_labels_info(label)
+
+    def __len__(self):
+        """Returns the length of the labels list for the dataset."""
+        return len(self.labels)
+
+    def update_labels_info(self, label):
+        """Hook for subclasses to customize the label format; the base class returns `label` unchanged."""
+        return label
+
+    def build_transforms(self, hyp=None):
+        """Users can custom augmentations here
+        like:
+            if self.augment:
+                # Training transforms
+                return Compose([])
+            else:
+                # Val transforms
+                return Compose([])
+        """
+        raise NotImplementedError
+
+    def get_labels(self):
+        """Users can custom their own format here.
+        Make sure your output is a list with each element like below:
+            dict(
+                im_file=im_file,
+                shape=shape,  # format: (height, width)
+                cls=cls,
+                bboxes=bboxes, # xywh
+                segments=segments,  # xy
+                keypoints=keypoints, # xy
+                normalized=True, # or False
+                bbox_format="xyxy",  # or xywh, ltwh
+            )
+        """
+        raise NotImplementedError
--- a/ultralytics/data/build.py
+++ b/ultralytics/data/build.py
@ -0,0 +1,170 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+import os
+import random
+from pathlib import Path
+
+import numpy as np
+import torch
+from PIL import Image
+from torch.utils.data import dataloader, distributed
+
+from ultralytics.data.loaders import (LOADERS, LoadImages, LoadPilAndNumpy, LoadScreenshots, LoadStreams, LoadTensor,
+                                      SourceTypes, autocast_list)
+from ultralytics.data.utils import IMG_FORMATS, VID_FORMATS
+from ultralytics.utils import RANK, colorstr
+from ultralytics.utils.checks import check_file
+
+from .dataset import YOLODataset
+from .utils import PIN_MEMORY
+
+
+class InfiniteDataLoader(dataloader.DataLoader):
+    """Dataloader that reuses workers by keeping a single iterator alive. Uses same syntax as vanilla DataLoader."""
+
+    def __init__(self, *args, **kwargs):
+        """Build a regular DataLoader, then wrap its batch sampler so that it repeats forever."""
+        super().__init__(*args, **kwargs)
+        endless = _RepeatSampler(self.batch_sampler)
+        # object.__setattr__ instead of plain assignment - presumably DataLoader restricts rebinding this
+        # attribute after construction (confirm against the torch DataLoader source)
+        object.__setattr__(self, 'batch_sampler', endless)
+        self.iterator = super().__iter__()
+
+    def __len__(self):
+        """Returns the length of the batch sampler's sampler."""
+        repeat_sampler = self.batch_sampler
+        return len(repeat_sampler.sampler)
+
+    def __iter__(self):
+        """Yield len(self) batches from the persistent iterator, i.e. one pass over the wrapped sampler."""
+        remaining = len(self)
+        while remaining > 0:
+            yield next(self.iterator)
+            remaining -= 1
+
+    def reset(self):
+        """Reset iterator.
+        This is useful when we want to modify settings of dataset while training.
+        """
+        self.iterator = self._get_iterator()
+
+
+class _RepeatSampler:
+    """
+    Sampler wrapper that restarts the wrapped sampler every time it is exhausted, forever.
+
+    Args:
+        sampler (Dataset.sampler): The sampler to repeat.
+    """
+
+    def __init__(self, sampler):
+        """Keep a reference to the sampler that will be repeated."""
+        self.sampler = sampler
+
+    def __iter__(self):
+        """Yield the sampler's items pass after pass, without end."""
+        while True:
+            for item in self.sampler:
+                yield item
+
+
+def seed_worker(worker_id):  # noqa
+    """Seed numpy and random in a dataloader worker https://pytorch.org/docs/stable/notes/randomness.html#dataloader."""
+    seed = torch.initial_seed() % (1 << 32)  # fold the torch seed into 32 bits
+    random.seed(seed)
+    np.random.seed(seed)
+
+
+def build_yolo_dataset(cfg, img_path, batch, data, mode='train', rect=False, stride=32):
+    """
+    Build a YOLODataset from the run configuration.
+
+    Augmentation and `cfg.fraction` are applied only when mode == 'train'; padding is 0.0 for training and 0.5
+    otherwise. Rectangular batches are enabled if either `cfg.rect` or `rect` is set.
+    """
+    training = mode == 'train'
+    dataset_kwargs = dict(
+        img_path=img_path,
+        imgsz=cfg.imgsz,
+        batch_size=batch,
+        augment=training,  # augmentation
+        hyp=cfg,  # TODO: probably add a get_hyps_from_cfg function
+        rect=cfg.rect or rect,  # rectangular batches
+        cache=cfg.cache or None,
+        single_cls=cfg.single_cls or False,
+        stride=int(stride),
+        pad=0.0 if training else 0.5,
+        prefix=colorstr(f'{mode}: '),
+        use_segments=cfg.task == 'segment',
+        use_keypoints=cfg.task == 'pose',
+        classes=cfg.classes,
+        data=data,
+        fraction=cfg.fraction if training else 1.0)
+    return YOLODataset(**dataset_kwargs)
+
+
+def build_dataloader(dataset, batch, workers, shuffle=True, rank=-1):
+    """
+    Return an InfiniteDataLoader for a training or validation set.
+
+    Args:
+        dataset: Dataset to load from; its `collate_fn` attribute is used when present.
+        batch (int): Requested batch size, capped at len(dataset).
+        workers (int): Upper bound on the number of worker processes.
+        shuffle (bool): Shuffle the data. Defaults to True.
+        rank (int): Distributed rank; -1 means no DistributedSampler is used. Defaults to -1.
+
+    Returns:
+        (InfiniteDataLoader): The configured dataloader.
+    """
+    batch = min(batch, len(dataset))
+    nd = torch.cuda.device_count()  # number of CUDA devices
+    cpus = os.cpu_count() or 1  # os.cpu_count() returns None when the CPU count cannot be determined
+    nw = min([cpus // max(nd, 1), batch if batch > 1 else 0, workers])  # number of workers
+    sampler = None if rank == -1 else distributed.DistributedSampler(dataset, shuffle=shuffle)
+    generator = torch.Generator()
+    generator.manual_seed(6148914691236517205 + RANK)
+    return InfiniteDataLoader(dataset=dataset,
+                              batch_size=batch,
+                              shuffle=shuffle and sampler is None,  # shuffling is left to the sampler when one is set
+                              num_workers=nw,
+                              sampler=sampler,
+                              pin_memory=PIN_MEMORY,
+                              collate_fn=getattr(dataset, 'collate_fn', None),
+                              worker_init_fn=seed_worker,
+                              generator=generator)
+
+
+def check_source(source):
+    """
+    Check source type and return corresponding flag values.
+
+    Args:
+        source: str, int or Path (file, URL, camera index, '.streams' file or 'screen'), an instance of one of
+            LOADERS, a list/tuple of images, a PIL image, a numpy array or a torch tensor.
+
+    Returns:
+        (tuple): (source, webcam, screenshot, from_img, in_memory, tensor). `source` may be replaced by the
+            result of check_file() for file URLs or by the result of autocast_list() for lists/tuples.
+
+    Raises:
+        TypeError: If the source type is not supported.
+    """
+    webcam, screenshot, from_img, in_memory, tensor = False, False, False, False, False
+    if isinstance(source, (str, int, Path)):  # int for local usb camera
+        source = str(source)
+        # Compare the extension case-insensitively so that e.g. 'image.JPG' counts as a file and a URL ending
+        # in '.MP4' is downloaded instead of being treated as a stream
+        is_file = Path(source).suffix[1:].lower() in (IMG_FORMATS + VID_FORMATS)
+        is_url = source.lower().startswith(('https://', 'http://', 'rtsp://', 'rtmp://'))
+        webcam = source.isnumeric() or source.endswith('.streams') or (is_url and not is_file)
+        screenshot = source.lower() == 'screen'
+        if is_url and is_file:
+            source = check_file(source)  # download
+    elif isinstance(source, tuple(LOADERS)):
+        in_memory = True
+    elif isinstance(source, (list, tuple)):
+        source = autocast_list(source)  # convert all list elements to PIL or np arrays
+        from_img = True
+    elif isinstance(source, (Image.Image, np.ndarray)):
+        from_img = True
+    elif isinstance(source, torch.Tensor):
+        tensor = True
+    else:
+        raise TypeError('Unsupported image type. For supported types see https://docs.ultralytics.com/modes/predict')
+
+    return source, webcam, screenshot, from_img, in_memory, tensor
+
+
+def load_inference_source(source=None, imgsz=640, vid_stride=1):
+    """
+    Pick and build the loader that matches an inference source.
+
+    Args:
+        source (str, Path, Tensor, PIL.Image, np.ndarray): The input source for inference.
+        imgsz (int, optional): The size of the image for inference. Default is 640.
+        vid_stride (int, optional): The frame interval for video sources. Default is 1.
+
+    Returns:
+        dataset (Dataset): A dataset object for the specified input source, with `source_type` attached.
+    """
+    source, webcam, screenshot, from_img, in_memory, tensor = check_source(source)
+    if in_memory:
+        source_type = source.source_type  # an already-built loader carries its own source type
+    else:
+        source_type = SourceTypes(webcam, screenshot, from_img, tensor)
+
+    # Dataloader: the first matching flag wins, in this order
+    if tensor:
+        dataset = LoadTensor(source)
+    elif in_memory:
+        dataset = source
+    elif webcam:
+        dataset = LoadStreams(source, imgsz=imgsz, vid_stride=vid_stride)
+    elif screenshot:
+        dataset = LoadScreenshots(source, imgsz=imgsz)
+    elif from_img:
+        dataset = LoadPilAndNumpy(source, imgsz=imgsz)
+    else:
+        dataset = LoadImages(source, imgsz=imgsz, vid_stride=vid_stride)
+
+    # Attach source types to the dataset
+    dataset.source_type = source_type
+
+    return dataset
--- a/ultralytics/data/converter.py
+++ b/ultralytics/data/converter.py
@ -0,0 +1,230 @@
+import json
+from collections import defaultdict
+from pathlib import Path
+
+import cv2
+import numpy as np
+from tqdm import tqdm
+
+from ultralytics.utils.checks import check_requirements
+from ultralytics.utils.files import make_dirs
+
+
+def coco91_to_coco80_class():
+    """Returns the lookup table that maps 91-index COCO class IDs to 80-index COCO class IDs.
+
+    Returns:
+        (list): A list of 91 entries. The list index is the zero-based 91-index class ID (convert_coco indexes it
+            with 'category_id - 1') and the value is the corresponding 80-index class ID, or None where the
+            91-index ID has no 80-index counterpart.
+
+    """
+    return [
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, None, 24, 25, None,
+        None, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, None, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+        51, 52, 53, 54, 55, 56, 57, 58, 59, None, 60, None, None, 61, None, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
+        None, 73, 74, 75, 76, 77, 78, 79, None]
+
+
+def convert_coco(labels_dir='../coco/annotations/', use_segments=False, use_keypoints=False, cls91to80=True):
+    """Converts COCO JSON annotation files to YOLO-format label text files.
+
+    Every '*.json' file found directly in labels_dir is loaded. For each annotated image one text file named after
+    the image 'file_name' (with a '.txt' suffix) is opened in append mode inside
+    '<save_dir>/labels/<json stem without "instances_">/', where save_dir is the value returned by
+    make_dirs('yolo_labels').
+
+    Args:
+        labels_dir (str, optional): Path to directory containing COCO dataset annotation files.
+        use_segments (bool, optional): Whether to include segmentation masks in the output.
+        use_keypoints (bool, optional): Whether to include keypoint annotations in the output.
+        cls91to80 (bool, optional): Whether to map 91 COCO class IDs to the corresponding 80 COCO class IDs.
+            When False, 'category_id - 1' is written as the class.
+
+    Example Usage:
+        convert_coco(labels_dir='../coco/annotations/', use_segments=True, use_keypoints=True, cls91to80=True)
+
+    Output:
+        One line per kept box: 'cls x_center y_center width height', all coordinates divided by the image width or
+        height. With use_keypoints the line is the box followed by the keypoints (x and y normalized, third value
+        unchanged); with use_segments a non-empty normalized polygon replaces the box.
+
+    Note:
+        NOTE(review): this function contains no explicit raise; an earlier docstring promised FileNotFoundError for
+        a missing labels_dir - confirm what Path.glob() does for a missing directory on the supported Python versions.
+        NOTE(review): bboxes, segments and keypoints are filled independently (boxes and segments are de-duplicated,
+        keypoints are not, and annotations may lack a segmentation or keypoints), yet they are read back with one
+        shared index when writing - verify that the three lists stay aligned for the datasets converted.
+    """
+
+    save_dir = make_dirs('yolo_labels')  # output directory
+    coco80 = coco91_to_coco80_class()  # lookup table indexed by 'category_id - 1'
+
+    # Import json
+    for json_file in sorted(Path(labels_dir).resolve().glob('*.json')):
+        fn = Path(save_dir) / 'labels' / json_file.stem.replace('instances_', '')  # folder name
+        fn.mkdir(parents=True, exist_ok=True)
+        with open(json_file) as f:
+            data = json.load(f)
+
+        # Create image dict, keyed by the image id formatted as a decimal string
+        images = {f'{x["id"]:d}': x for x in data['images']}
+        # Create image-annotations dict (image id -> list of its annotations)
+        imgToAnns = defaultdict(list)
+        for ann in data['annotations']:
+            imgToAnns[ann['image_id']].append(ann)
+
+        # Write labels file
+        for img_id, anns in tqdm(imgToAnns.items(), desc=f'Annotations {json_file}'):
+            img = images[f'{img_id:d}']
+            h, w, f = img['height'], img['width'], img['file_name']  # note: 'f' is reused here as the file name
+
+            bboxes = []
+            segments = []
+            keypoints = []
+            for ann in anns:
+                if ann['iscrowd']:
+                    continue  # crowd annotations are not converted
+                # The COCO box format is [top left x, top left y, width, height]
+                box = np.array(ann['bbox'], dtype=np.float64)
+                box[:2] += box[2:] / 2  # xy top-left corner to center
+                box[[0, 2]] /= w  # normalize x
+                box[[1, 3]] /= h  # normalize y
+                if box[2] <= 0 or box[3] <= 0:  # skip boxes whose width or height is not positive
+                    continue
+
+                cls = coco80[ann['category_id'] - 1] if cls91to80 else ann['category_id'] - 1  # class
+                box = [cls] + box.tolist()
+                if box not in bboxes:  # drop exact duplicate boxes
+                    bboxes.append(box)
+                if use_segments and ann.get('segmentation') is not None:
+                    if len(ann['segmentation']) == 0:
+                        segments.append([])  # placeholder; the writer falls back to the box for empty segments
+                        continue
+                    if isinstance(ann['segmentation'], dict):
+                        ann['segmentation'] = rle2polygon(ann['segmentation'])  # dict form is treated as RLE
+                    if len(ann['segmentation']) > 1:
+                        # several polygons: stitch them into one outline before normalizing
+                        s = merge_multi_segment(ann['segmentation'])
+                        s = (np.concatenate(s, axis=0) / np.array([w, h])).reshape(-1).tolist()
+                    else:
+                        s = [j for i in ann['segmentation'] for j in i]  # all segments concatenated
+                        s = (np.array(s).reshape(-1, 2) / np.array([w, h])).reshape(-1).tolist()
+                    s = [cls] + s
+                    if s not in segments:  # drop exact duplicate segments
+                        segments.append(s)
+                if use_keypoints and ann.get('keypoints') is not None:
+                    # keypoints are triplets; x and y are normalized, the third value is left unchanged
+                    k = (np.array(ann['keypoints']).reshape(-1, 3) / np.array([w, h, 1])).reshape(-1).tolist()
+                    k = box + k
+                    keypoints.append(k)
+
+            # Write (append mode: lines are added to an existing label file, not replaced)
+            with open((fn / f).with_suffix('.txt'), 'a') as file:
+                for i in range(len(bboxes)):
+                    if use_keypoints:
+                        line = *(keypoints[i]),  # cls, box, keypoints
+                    else:
+                        line = *(segments[i]
+                                 if use_segments and len(segments[i]) > 0 else bboxes[i]),  # cls, box or segments
+                    file.write(('%g ' * len(line)).rstrip() % line + '\n')
+
+
+def rle2polygon(segmentation):
+    """
+    Convert a Run-Length Encoding (RLE) mask to polygon coordinates.
+
+    The mask is decoded with pycocotools, its external contours are extracted with OpenCV, and each contour is
+    simplified with a tolerance of 0.1% of its perimeter before being flattened.
+
+    Args:
+        segmentation (dict, list): RLE mask representation of the object segmentation.
+
+    Returns:
+        (list): A list of lists representing the polygon coordinates for each contour.
+
+    Note:
+        Requires the 'pycocotools' package to be installed.
+    """
+    check_requirements('pycocotools')
+    from pycocotools import mask as coco_mask
+
+    binary = coco_mask.decode(segmentation)
+    binary[binary > 0] = 255  # every non-zero pixel becomes 255
+    contours, _hierarchy = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_TC89_KCOS)
+
+    # One flat coordinate list per contour; the contour is treated as closed in both OpenCV calls
+    return [
+        cv2.approxPolyDP(outline, 0.001 * cv2.arcLength(outline, True), True).flatten().tolist()
+        for outline in contours]
+
+
+def min_index(arr1, arr2):
+    """
+    Locate the closest pair of points between two arrays of 2D points.
+
+    Args:
+        arr1 (np.array): A NumPy array of shape (N, 2) representing N 2D points.
+        arr2 (np.array): A NumPy array of shape (M, 2) representing M 2D points.
+
+    Returns:
+        (tuple): A tuple containing the indexes of the points with the shortest distance in arr1 and arr2 respectively.
+    """
+    # Broadcast to an (N, M, 2) table of coordinate differences, then reduce to squared distances (N, M)
+    deltas = arr1[:, None, :] - arr2[None, :, :]
+    sq_dist = np.square(deltas).sum(axis=-1)
+    # argmin over the flattened table, mapped back to a (row in arr1, row in arr2) pair
+    flat_pos = np.argmin(sq_dist, axis=None)
+    return np.unravel_index(flat_pos, sq_dist.shape)
+
+
+def merge_multi_segment(segments):
+    """
+    Merge multiple segments into one list by connecting the coordinates with the minimum distance between each segment.
+    This function connects these coordinates with a thin line to merge all segments into one.
+
+    Consecutive segments (i - 1, i) are linked at their closest pair of points (see min_index). A forward pass emits
+    the first segment, the leading part of every middle segment and the last segment; a backward pass then emits
+    the remaining part of every middle segment in reverse order.
+
+    Args:
+        segments (List[List]): Original segmentations in COCO's JSON file.
+                               Each element is a list of coordinates, like [segmentation1, segmentation2,...].
+
+    Returns:
+        s (List[np.ndarray]): A list of connected segments represented as NumPy arrays.
+
+    Note:
+        With fewer than two segments no link index is recorded, so the 'idx[0]' lookup in the forward pass raises
+        IndexError; the visible caller (convert_coco) only calls this when there is more than one segment.
+    """
+    s = []
+    segments = [np.array(i).reshape(-1, 2) for i in segments]  # each segment as an (n_points, 2) array
+    idx_list = [[] for _ in range(len(segments))]  # per segment: link index to previous, then to next segment
+
+    # record the indexes with min distance between each segment
+    for i in range(1, len(segments)):
+        idx1, idx2 = min_index(segments[i - 1], segments[i])
+        idx_list[i - 1].append(idx1)
+        idx_list[i].append(idx2)
+
+    # use two round to connect all the segments
+    for k in range(2):
+        # forward connection
+        if k == 0:
+            for i, idx in enumerate(idx_list):
+                # middle segments have two indexes
+                # reverse the index of middle segments
+                if len(idx) == 2 and idx[0] > idx[1]:
+                    idx = idx[::-1]  # local copy only; idx_list[i] keeps its original order
+                    segments[i] = segments[i][::-1, :]
+
+                # rotate so the point at idx[0] comes first, then repeat it at the end
+                segments[i] = np.roll(segments[i], -idx[0], axis=0)
+                segments[i] = np.concatenate([segments[i], segments[i][:1]])
+                # deal with the first segment and the last one
+                if i in [0, len(idx_list) - 1]:
+                    s.append(segments[i])
+                else:
+                    # middle segment: emit only rows 0..(idx[1] - idx[0]) of the rotated array
+                    idx = [0, idx[1] - idx[0]]
+                    s.append(segments[i][idx[0]:idx[1] + 1])
+
+        else:
+            # backward connection: walk the middle segments in reverse and emit their remaining rows
+            for i in range(len(idx_list) - 1, -1, -1):
+                if i not in [0, len(idx_list) - 1]:
+                    idx = idx_list[i]
+                    nidx = abs(idx[1] - idx[0])  # abs() because idx_list[i] was not reordered in the forward pass
+                    s.append(segments[i][nidx:])
+    return s
+
+
+def delete_dsstore(path='../datasets'):
+    """
+    Delete Apple .DS_Store files in the specified directory and its subdirectories.
+
+    Args:
+        path (str | Path, optional): Root directory that is searched recursively. Defaults to '../datasets'.
+
+    Note:
+        The list of matched files is printed before they are removed.
+    """
+    from pathlib import Path
+
+    # The file is named '.DS_Store' (capital 'S' in 'Store'); the lowercase spelling '.DS_store' matches nothing
+    # on case-sensitive filesystems.
+    files = list(Path(path).rglob('.DS_Store'))
+    print(files)
+    for f in files:
+        f.unlink()
+
+
+if __name__ == '__main__':
+    # Script entry point: pick the annotation format to convert
+    source = 'COCO'
+
+    if source == 'COCO':
+        coco_options = dict(use_segments=False, use_keypoints=True, cls91to80=False)
+        convert_coco('../datasets/coco/annotations', **coco_options)  # directory with *.json
--- a/ultralytics/data/dataloaders/init.py
+++ b/ultralytics/data/dataloaders/init.py
--- a/ultralytics/data/dataset.py
+++ b/ultralytics/data/dataset.py
@ -0,0 +1,275 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+from itertools import repeat
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+
+import cv2
+import numpy as np
+import torch
+import torchvision
+from tqdm import tqdm
+
+from ultralytics.utils import LOCAL_RANK, NUM_THREADS, TQDM_BAR_FORMAT, is_dir_writeable
+
+from .augment import Compose, Format, Instances, LetterBox, classify_albumentations, classify_transforms, v8_transforms
+from .base import BaseDataset
+from .utils import HELP_URL, LOGGER, get_hash, img2label_paths, verify_image_label
+
+
+class YOLODataset(BaseDataset):
+    """
+    Dataset class for loading object detection and/or segmentation labels in YOLO format.
+
+    Args:
+        data (dict, optional): A dataset YAML dictionary. Defaults to None.
+        use_segments (bool, optional): If True, segmentation masks are used as labels. Defaults to False.
+        use_keypoints (bool, optional): If True, keypoints are used as labels. Defaults to False.
+
+    Returns:
+        (torch.utils.data.Dataset): A PyTorch dataset object that can be used for training an object detection model.
+    """
+    cache_version = '1.0.2'  # dataset labels *.cache version, >= 1.0.0 for YOLOv8
+    rand_interp_methods = [cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4]
+
+    def __init__(self, *args, data=None, use_segments=False, use_keypoints=False, **kwargs):
+        """
+        Store the label options, then hand the remaining arguments to BaseDataset.__init__.
+
+        The attributes are assigned before super().__init__() is called.
+        NOTE(review): presumably because the base initializer triggers get_labels() - confirm against BaseDataset.
+        """
+        self.use_segments = use_segments
+        self.use_keypoints = use_keypoints
+        self.data = data
+        assert not (self.use_segments and self.use_keypoints), 'Can not use both segments and keypoints.'
+        super().__init__(*args, **kwargs)
+
+    def cache_labels(self, path=Path('./labels.cache')):
+        """Cache dataset labels, check images and read shapes.
+
+        Every (image, label) pair is verified with verify_image_label in a thread pool; the collected labels and
+        statistics are saved to `path` when its parent directory is writeable.
+
+        Args:
+            path (Path): path where to save the cache file (default: Path('./labels.cache')).
+        Returns:
+            (dict): labels, plus the 'hash', 'results', 'msgs' and 'version' entries.
+        Raises:
+            ValueError: If keypoints are requested but 'kpt_shape' in the dataset dict is missing or invalid.
+        """
+        x = {'labels': []}
+        nm, nf, ne, nc, msgs = 0, 0, 0, 0, []  # number missing, found, empty, corrupt, messages
+        desc = f'{self.prefix}Scanning {path.parent / path.stem}...'
+        total = len(self.im_files)
+        nkpt, ndim = self.data.get('kpt_shape', (0, 0))
+        if self.use_keypoints and (nkpt <= 0 or ndim not in (2, 3)):
+            raise ValueError("'kpt_shape' in data.yaml missing or incorrect. Should be a list with [number of "
+                             "keypoints, number of dims (2 for x,y or 3 for x,y,visible)], i.e. 'kpt_shape: [17, 3]'")
+        with ThreadPool(NUM_THREADS) as pool:
+            results = pool.imap(func=verify_image_label,
+                                iterable=zip(self.im_files, self.label_files, repeat(self.prefix),
+                                             repeat(self.use_keypoints), repeat(len(self.data['names'])), repeat(nkpt),
+                                             repeat(ndim)))
+            pbar = tqdm(results, desc=desc, total=total, bar_format=TQDM_BAR_FORMAT)
+            for im_file, lb, shape, segments, keypoint, nm_f, nf_f, ne_f, nc_f, msg in pbar:
+                nm += nm_f
+                nf += nf_f
+                ne += ne_f
+                nc += nc_f
+                if im_file:  # a falsy im_file means the pair is not added to the labels
+                    x['labels'].append(
+                        dict(
+                            im_file=im_file,
+                            shape=shape,
+                            cls=lb[:, 0:1],  # n, 1
+                            bboxes=lb[:, 1:],  # n, 4
+                            segments=segments,
+                            keypoints=keypoint,
+                            normalized=True,
+                            bbox_format='xywh'))
+                if msg:
+                    msgs.append(msg)
+                pbar.desc = f'{desc} {nf} images, {nm + ne} backgrounds, {nc} corrupt'
+            pbar.close()
+
+        if msgs:
+            LOGGER.info('\n'.join(msgs))
+        if nf == 0:
+            LOGGER.warning(f'{self.prefix}WARNING ⚠️ No labels found in {path}. {HELP_URL}')
+        x['hash'] = get_hash(self.label_files + self.im_files)
+        x['results'] = nf, nm, ne, nc, len(self.im_files)
+        x['msgs'] = msgs  # warnings
+        x['version'] = self.cache_version  # cache version
+        if is_dir_writeable(path.parent):
+            if path.exists():
+                path.unlink()  # remove *.cache file if exists
+            np.save(str(path), x)  # save cache for next time (np.save appends '.npy' to the name)
+            path.with_suffix('.cache.npy').rename(path)  # remove .npy suffix
+            LOGGER.info(f'{self.prefix}New cache created: {path}')
+        else:
+            LOGGER.warning(f'{self.prefix}WARNING ⚠️ Cache directory {path.parent} is not writeable, cache not saved.')
+        return x
+
+    def get_labels(self):
+        """
+        Returns the list of per-image label dicts for YOLO training.
+
+        The '*.cache' file next to the labels directory is used when it loads and its version and hash match;
+        otherwise the cache is rebuilt with cache_labels(). self.im_files is replaced by the files kept in the cache.
+
+        Raises:
+            FileNotFoundError: If the cache reports that no labels were found.
+            ValueError: If the labels of all images are empty.
+        """
+        self.label_files = img2label_paths(self.im_files)
+        cache_path = Path(self.label_files[0]).parent.with_suffix('.cache')
+        try:
+            import gc
+            gc.disable()  # reduce pickle load time https://github.com/ultralytics/ultralytics/pull/1585
+            try:
+                cache, exists = np.load(str(cache_path), allow_pickle=True).item(), True  # load dict
+            finally:
+                gc.enable()  # always restore the collector, also when the cache file is missing or unreadable
+            assert cache['version'] == self.cache_version  # matches current version
+            assert cache['hash'] == get_hash(self.label_files + self.im_files)  # identical hash
+        except (FileNotFoundError, AssertionError, AttributeError):
+            cache, exists = self.cache_labels(cache_path), False  # run cache ops
+
+        # Display cache
+        nf, nm, ne, nc, n = cache.pop('results')  # found, missing, empty, corrupt, total
+        if exists and LOCAL_RANK in (-1, 0):
+            d = f'Scanning {cache_path}... {nf} images, {nm + ne} backgrounds, {nc} corrupt'
+            tqdm(None, desc=self.prefix + d, total=n, initial=n, bar_format=TQDM_BAR_FORMAT)  # display cache results
+            if cache['msgs']:
+                LOGGER.info('\n'.join(cache['msgs']))  # display warnings
+        if nf == 0:  # number of labels found
+            raise FileNotFoundError(f'{self.prefix}No labels found in {cache_path}, can not start training. {HELP_URL}')
+
+        # Read cache
+        for k in ('hash', 'version', 'msgs'):  # remove bookkeeping items
+            cache.pop(k)
+        labels = cache['labels']
+        self.im_files = [lb['im_file'] for lb in labels]  # update im_files
+
+        # Check if the dataset is all boxes or all segments
+        lengths = ((len(lb['cls']), len(lb['bboxes']), len(lb['segments'])) for lb in labels)
+        len_cls, len_boxes, len_segments = (sum(x) for x in zip(*lengths))
+        if len_segments and len_boxes != len_segments:
+            LOGGER.warning(
+                f'WARNING ⚠️ Box and segment counts should be equal, but got len(segments) = {len_segments}, '
+                f'len(boxes) = {len_boxes}. To resolve this only boxes will be used and all segments will be removed. '
+                'To avoid this please supply either a detect or segment dataset, not a detect-segment mixed dataset.')
+            for lb in labels:
+                lb['segments'] = []
+        if len_cls == 0:
+            raise ValueError(f'All labels empty in {cache_path}, can not start training without labels. {HELP_URL}')
+        return labels
+
+    # TODO: use hyp config to set all these augmentations
+    def build_transforms(self, hyp=None):
+        """
+        Builds the transform pipeline and appends the final Format step.
+
+        With augmentation enabled, mosaic and mixup on `hyp` are zeroed in rect mode before v8_transforms is built;
+        otherwise a single LetterBox is used.
+
+        Args:
+            hyp: Hyperparameter object; `mask_ratio` and `overlap_mask` are always read from it, so the None default
+                is not usable as-is.
+        """
+        if self.augment:
+            # self.augment is already known to be True here, so only rect mode disables mosaic/mixup
+            hyp.mosaic = 0.0 if self.rect else hyp.mosaic
+            hyp.mixup = 0.0 if self.rect else hyp.mixup
+            transforms = v8_transforms(self, self.imgsz, hyp)
+        else:
+            transforms = Compose([LetterBox(new_shape=(self.imgsz, self.imgsz), scaleup=False)])
+        transforms.append(
+            Format(bbox_format='xywh',
+                   normalize=True,
+                   return_mask=self.use_segments,
+                   return_keypoint=self.use_keypoints,
+                   batch_idx=True,
+                   mask_ratio=hyp.mask_ratio,
+                   mask_overlap=hyp.overlap_mask))
+        return transforms
+
+    def close_mosaic(self, hyp):
+        """Sets mosaic, copy_paste and mixup options to 0.0 and builds transformations."""
+        hyp.mosaic = 0.0  # set mosaic ratio=0.0
+        hyp.copy_paste = 0.0  # keep the same behavior as previous v8 close-mosaic
+        hyp.mixup = 0.0  # keep the same behavior as previous v8 close-mosaic
+        self.transforms = self.build_transforms(hyp)
+
+    def update_labels_info(self, label):
+        """
+        Replaces the raw box/segment/keypoint entries of a label dict by a single 'instances' entry.
+
+        Customize your label format here.
+        """
+        # NOTE: cls is not with bboxes now, classification and semantic segmentation need an independent cls label
+        # we can make it also support classification and semantic segmentation by add or remove some dict keys there.
+        bboxes = label.pop('bboxes')
+        segments = label.pop('segments')
+        keypoints = label.pop('keypoints', None)
+        bbox_format = label.pop('bbox_format')
+        normalized = label.pop('normalized')
+        label['instances'] = Instances(bboxes, segments, keypoints, bbox_format=bbox_format, normalized=normalized)
+        return label
+
+    @staticmethod
+    def collate_fn(batch):
+        """
+        Collates data samples into batches.
+
+        'img' values are stacked, 'masks', 'keypoints', 'bboxes' and 'cls' are concatenated, every other key keeps
+        the tuple of per-sample values. Each sample's 'batch_idx' is offset by its position in the batch before
+        concatenation. All samples are expected to share the key order of the first sample.
+        """
+        new_batch = {}
+        keys = batch[0].keys()
+        values = list(zip(*[list(b.values()) for b in batch]))
+        for i, k in enumerate(keys):
+            value = values[i]
+            if k == 'img':
+                value = torch.stack(value, 0)
+            if k in ['masks', 'keypoints', 'bboxes', 'cls']:
+                value = torch.cat(value, 0)
+            new_batch[k] = value
+        # add target image index for build_targets(); built out-of-place so the sample tensors are left untouched
+        new_batch['batch_idx'] = torch.cat([idx + i for i, idx in enumerate(new_batch['batch_idx'])], 0)
+        return new_batch
+
+
+# Classification dataloaders -------------------------------------------------------------------------------------------
+class ClassificationDataset(torchvision.datasets.ImageFolder):
+    """
+    YOLO Classification Dataset.
+
+    Args:
+        root (str): Dataset path.
+
+    Attributes:
+        cache_ram (bool): True if images should be cached in RAM, False otherwise.
+        cache_disk (bool): True if images should be cached on disk, False otherwise.
+        samples (list): List of samples containing file, index, npy, and im.
+        torch_transforms (callable): torchvision transforms applied to the dataset.
+        album_transforms (callable, optional): Albumentations transforms applied to the dataset if augment is True.
+    """
+
+    def __init__(self, root, args, augment=False, cache=False):
+        """
+        Initialize YOLO object with root, image size, augmentations, and cache settings.
+
+        Args:
+            root (str): Dataset path.
+            args (Namespace): Argument parser containing dataset related settings.
+            augment (bool, optional): True if dataset should be augmented, False otherwise. Defaults to False.
+            cache (bool | str | optional): Cache setting, can be True, False, 'ram' or 'disk'. Defaults to False.
+        """
+        super().__init__(root=root)
+        if augment and args.fraction < 1.0:  # reduce training fraction
+            self.samples = self.samples[:round(len(self.samples) * args.fraction)]
+        self.cache_ram = cache is True or cache == 'ram'
+        self.cache_disk = cache == 'disk'
+        self.samples = [list(x) + [Path(x[0]).with_suffix('.npy'), None] for x in self.samples]  # file, index, npy, im
+        self.torch_transforms = classify_transforms(args.imgsz)
+        self.album_transforms = classify_albumentations(
+            augment=augment,
+            size=args.imgsz,
+            scale=(1.0 - args.scale, 1.0),  # (0.08, 1.0)
+            hflip=args.fliplr,
+            vflip=args.flipud,
+            hsv_h=args.hsv_h,  # HSV-Hue augmentation (fraction)
+            hsv_s=args.hsv_s,  # HSV-Saturation augmentation (fraction)
+            hsv_v=args.hsv_v,  # HSV-Value augmentation (fraction)
+            mean=(0.0, 0.0, 0.0),  # IMAGENET_MEAN
+            std=(1.0, 1.0, 1.0),  # IMAGENET_STD
+            auto_aug=False) if augment else None
+
+    def __getitem__(self, i):
+        """
+        Returns the transformed image and class index of sample `i`.
+
+        Args:
+            i (int): Sample index.
+
+        Returns:
+            (dict): {'img': transformed image, 'cls': class index}.
+        """
+        f, j, fn, im = self.samples[i]  # filename, index, filename.with_suffix('.npy'), image
+        if self.cache_ram:
+            # Two separate checks on purpose: an image that is already cached must be reused as-is and must NOT
+            # fall through to the disk-read branches below.
+            if im is None:
+                im = self.samples[i][3] = cv2.imread(f)
+        elif self.cache_disk:
+            if not fn.exists():  # create the *.npy copy on first access
+                np.save(fn.as_posix(), cv2.imread(f))
+            im = np.load(fn)
+        else:  # read image
+            im = cv2.imread(f)  # BGR
+        if self.album_transforms:
+            sample = self.album_transforms(image=cv2.cvtColor(im, cv2.COLOR_BGR2RGB))['image']
+        else:
+            sample = self.torch_transforms(im)
+        return {'img': sample, 'cls': j}
+
+    def __len__(self) -> int:
+        """Returns the number of samples in the dataset."""
+        return len(self.samples)
+
+
+# TODO: support semantic segmentation
+class SemanticDataset(BaseDataset):
+    """
+    Placeholder dataset for semantic segmentation; it adds nothing to BaseDataset yet (see the TODO above).
+
+    NOTE(review): __init__ forwards no arguments to BaseDataset.__init__ - confirm the base initializer can be
+    called without any before instantiating this class.
+    """
+
+    def __init__(self):
+        """Initialize a SemanticDataset object."""
+        super().__init__()
--- a/ultralytics/data/loaders.py
+++ b/ultralytics/data/loaders.py
@ -0,0 +1,403 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+import glob
+import math
+import os
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from threading import Thread
+from urllib.parse import urlparse
+
+import cv2
+import numpy as np
+import requests
+import torch
+from PIL import Image
+
+from ultralytics.data.utils import IMG_FORMATS, VID_FORMATS
+from ultralytics.utils import LOGGER, ROOT, is_colab, is_kaggle, ops
+from ultralytics.utils.checks import check_requirements
+
+
+@dataclass
+class SourceTypes:
+    """Boolean flags describing the kind of an inference source; all flags default to False."""
+    webcam: bool = False  # NOTE(review): presumably set for stream sources handled by LoadStreams - confirm
+    screenshot: bool = False  # NOTE(review): presumably set for screen-capture sources - confirm
+    from_img: bool = False  # NOTE(review): presumably set for in-memory PIL/numpy images - confirm
+    tensor: bool = False  # NOTE(review): presumably set for torch.Tensor sources - confirm
+
+
+class LoadStreams:
+    """YOLOv8 streamloader, i.e. `yolo predict source='rtsp://example.com/media.mp4'  # RTSP, RTMP, HTTP streams`."""
+
+    def __init__(self, sources='file.streams', imgsz=640, vid_stride=1):
+        """
+        Open every stream, read its first frame and start one daemon reader thread per stream.
+
+        Args:
+            sources (str): A single stream source, or the path of a text file listing one source per whitespace-
+                separated token.
+            imgsz (int, optional): Image size stored on the loader. Defaults to 640.
+            vid_stride (int, optional): Only every vid_stride-th grabbed frame is retrieved. Defaults to 1.
+
+        Raises:
+            NotImplementedError: If the local webcam (source 0) is requested in Colab or Kaggle.
+            ConnectionError: If a stream cannot be opened or its first frame cannot be read.
+        """
+        torch.backends.cudnn.benchmark = True  # faster for fixed-size inference
+        self.mode = 'stream'
+        self.imgsz = imgsz
+        self.vid_stride = vid_stride  # video frame-rate stride
+        sources = Path(sources).read_text().rsplit() if os.path.isfile(sources) else [sources]
+        n = len(sources)
+        self.sources = [ops.clean_str(x) for x in sources]  # clean source names for later
+        # One independent frame buffer per stream. '[[]] * n' would repeat a reference to ONE list, so every stream
+        # would append to and pop from the same buffer.
+        self.imgs = [[] for _ in range(n)]
+        self.fps, self.frames, self.threads, self.shape = [0] * n, [0] * n, [None] * n, [None] * n
+        for i, s in enumerate(sources):  # index, source
+            # Start thread to read frames from video stream
+            st = f'{i + 1}/{n}: {s}... '
+            if urlparse(s).hostname in ('www.youtube.com', 'youtube.com', 'youtu.be'):  # if source is YouTube video
+                # YouTube format i.e. 'https://www.youtube.com/watch?v=Zgi9g1ksQHc' or 'https://youtu.be/Zgi9g1ksQHc'
+                s = get_best_youtube_url(s)
+            s = int(s) if s.isnumeric() else s  # i.e. s = '0' local webcam; int() avoids evaluating user input
+            if s == 0 and (is_colab() or is_kaggle()):
+                raise NotImplementedError("'source=0' webcam not supported in Colab and Kaggle notebooks. "
+                                          "Try running 'source=0' in a local environment.")
+            cap = cv2.VideoCapture(s)
+            if not cap.isOpened():
+                raise ConnectionError(f'{st}Failed to open {s}')
+            w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+            h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+            fps = cap.get(cv2.CAP_PROP_FPS)  # warning: may return 0 or nan
+            self.frames[i] = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float('inf')  # infinite stream fallback
+            self.fps[i] = max((fps if math.isfinite(fps) else 0) % 100, 0) or 30  # 30 FPS fallback
+
+            success, im = cap.read()  # guarantee first frame
+            if not success or im is None:
+                raise ConnectionError(f'{st}Failed to read images from {s}')
+            self.imgs[i].append(im)
+            self.shape[i] = im.shape
+            self.threads[i] = Thread(target=self.update, args=([i, cap, s]), daemon=True)
+            LOGGER.info(f'{st}Success ✅ ({self.frames[i]} frames of shape {w}x{h} at {self.fps[i]:.2f} FPS)')
+            self.threads[i].start()
+        LOGGER.info('')  # newline
+
+        self.bs = len(self)  # batch size = number of streams
+
+    def update(self, i, cap, stream):
+        """
+        Read stream `i` frames in daemon thread.
+
+        Args:
+            i (int): Index of the stream in self.sources.
+            cap (cv2.VideoCapture): Opened capture of the stream.
+            stream (str | int): Source used to re-open the capture after a failed retrieve.
+        """
+        n, f = 0, self.frames[i]  # frames grabbed so far, total frame count (inf for endless streams)
+        while cap.isOpened() and n < f:
+            # Only read a new frame if the buffer is empty
+            if not self.imgs[i]:
+                n += 1
+                cap.grab()  # .read() = .grab() followed by .retrieve()
+                if n % self.vid_stride == 0:
+                    success, im = cap.retrieve()
+                    if success:
+                        self.imgs[i].append(im)  # add image to buffer
+                    else:
+                        LOGGER.warning('WARNING ⚠️ Video stream unresponsive, please check your IP camera connection.')
+                        # black placeholder with the dtype of decoded frames (np.zeros alone would give float64)
+                        self.imgs[i].append(np.zeros(self.shape[i], dtype=np.uint8))
+                        cap.open(stream)  # re-open stream if signal was lost
+            else:
+                time.sleep(0.01)  # wait until the buffer is empty
+
+    def __iter__(self):
+        """Resets the batch counter and returns the loader itself as iterator."""
+        self.count = -1
+        return self
+
+    def __next__(self):
+        """
+        Returns the source names, one frame per stream, None and an empty string.
+
+        Raises:
+            StopIteration: If a reader thread has stopped or 'q' is pressed while waiting for frames.
+        """
+        self.count += 1
+
+        # Wait until a frame is available in each buffer
+        while not all(self.imgs):
+            if not all(x.is_alive() for x in self.threads) or cv2.waitKey(1) == ord('q'):  # q to quit
+                cv2.destroyAllWindows()
+                raise StopIteration
+            time.sleep(1 / min(self.fps))
+
+        # Get and remove the next frame from imgs buffer
+        return self.sources, [x.pop(0) for x in self.imgs], None, ''
+
+    def __len__(self):
+        """Return the number of streams."""
+        return len(self.sources)
+
+
+class LoadScreenshots:
+    """YOLOv8 screenshot dataloader, i.e. `yolo predict source=screen`."""
+
+    def __init__(self, source, imgsz=640):
+        """
+        Parse the capture region from `source` and open an mss capture session.
+
+        source = [screen_number left top width height] (pixels), given after a leading token that is discarded.
+        Accepted forms are 1 value (screen number), 4 values (left top width height) or all 5 values; any other
+        count keeps the defaults (full screen 0).
+        """
+        check_requirements('mss')
+        import mss  # noqa
+
+        _, *params = source.split()  # drop the leading token, keep the optional integers
+        self.screen = 0  # default to full screen 0
+        left = top = width = height = None
+        n_params = len(params)
+        if n_params == 1:
+            self.screen = int(params[0])
+        elif n_params == 4:
+            left, top, width, height = map(int, params)
+        elif n_params == 5:
+            self.screen, left, top, width, height = map(int, params)
+        self.imgsz = imgsz
+        self.mode = 'stream'
+        self.frame = 0
+        self.sct = mss.mss()
+        self.bs = 1
+
+        # Parse monitor shape: offsets are relative to the selected monitor, sizes default to the monitor size
+        monitor = self.sct.monitors[self.screen]
+        self.top = monitor['top'] + (0 if top is None else top)
+        self.left = monitor['left'] + (0 if left is None else left)
+        self.width = width if width else monitor['width']
+        self.height = height if height else monitor['height']
+        self.monitor = dict(left=self.left, top=self.top, width=self.width, height=self.height)
+
+    def __iter__(self):
+        """Returns the loader itself as iterator."""
+        return self
+
+    def __next__(self):
+        """mss screen capture: get raw pixels from the screen as np array."""
+        shot = self.sct.grab(self.monitor)
+        im0 = np.array(shot)[:, :, :3]  # [:, :, :3] BGRA to BGR
+        s = f'screen {self.screen} (LTWH): {self.left},{self.top},{self.width},{self.height}: '
+
+        self.frame += 1
+        # NOTE(review): the screen name and image are returned bare, while LoadStreams and LoadImages return
+        # lists - confirm the consumer handles both shapes.
+        return str(self.screen), im0, None, s  # screen name, image, None, log string
+
+
+class LoadImages:
+    """YOLOv8 image/video dataloader, i.e. `yolo predict source=image.jpg/vid.mp4`."""
+
+    def __init__(self, path, imgsz=640, vid_stride=1):
+        """
+        Collect the image and video files referenced by `path` and open the first video, if any.
+
+        Args:
+            path (str | Path | list | tuple): A file, a directory, a glob pattern containing '*', a '*.txt' file
+                (given as str) listing such entries, or a list/tuple of entries.
+            imgsz (int, optional): Image size stored on the loader. Defaults to 640.
+            vid_stride (int, optional): Video frame-rate stride. Defaults to 1.
+
+        Raises:
+            FileNotFoundError: If an entry does not exist, or if no file with a supported image or video extension
+                is found.
+        """
+        if isinstance(path, str) and Path(path).suffix == '.txt':  # *.txt file with img/vid/dir on each line
+            path = Path(path).read_text().rsplit()
+        files = []
+        p = path  # keeps the 'No images or videos found' message well-defined when `path` is an empty list/tuple
+        for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
+            p = str(Path(p).absolute())  # do not use .resolve() https://github.com/ultralytics/ultralytics/issues/2912
+            if '*' in p:
+                files.extend(sorted(glob.glob(p, recursive=True)))  # glob
+            elif os.path.isdir(p):
+                files.extend(sorted(glob.glob(os.path.join(p, '*.*'))))  # dir
+            elif os.path.isfile(p):
+                files.append(p)  # files
+            else:
+                raise FileNotFoundError(f'{p} does not exist')
+
+        # Split by extension; files with any other extension are ignored
+        images = [x for x in files if x.split('.')[-1].lower() in IMG_FORMATS]
+        videos = [x for x in files if x.split('.')[-1].lower() in VID_FORMATS]
+        ni, nv = len(images), len(videos)
+
+        self.imgsz = imgsz
+        self.files = images + videos  # all images first, then all videos
+        self.nf = ni + nv  # number of files
+        self.video_flag = [False] * ni + [True] * nv
+        self.mode = 'image'
+        self.vid_stride = vid_stride  # video frame-rate stride
+        self.bs = 1
+        if videos:
+            self.orientation = None  # rotation degrees
+            self._new_video(videos[0])  # new video
+        else:
+            self.cap = None
+        if self.nf == 0:
+            raise FileNotFoundError(f'No images or videos found in {p}. '
+                                    f'Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}')
+
+    def __iter__(self):
+        """Returns an iterator object for VideoStream or ImageFolder."""
+        self.count = 0
+        return self
+
+    def __next__(self):
+        """Return next image, path and metadata from dataset."""
+        if self.count == self.nf:
+            raise StopIteration
+        path = self.files[self.count]
+
+        if self.video_flag[self.count]:
+            # Read video
+            self.mode = 'video'
+            for _ in range(self.vid_stride):
+                self.cap.grab()
+            success, im0 = self.cap.retrieve()
+            while not success:
+                self.count += 1
+                self.cap.release()
+                if self.count == self.nf:  # last video
+                    raise StopIteration
+                path = self.files[self.count]
+                self._new_video(path)
+                success, im0 = self.cap.read()
+
+            self.frame += 1
+            # im0 = self._cv2_rotate(im0)  # for use if cv2 autorotation is False
+            s = f'video {self.count + 1}/{self.nf} ({self.frame}/{self.frames}) {path}: '
+
+        else:
+            # Read image
+            self.count += 1
+            im0 = cv2.imread(path)  # BGR
+            if im0 is None:
+                raise FileNotFoundError(f'Image Not Found {path}')
+            s = f'image {self.count}/{self.nf} {path}: '
+
+        return [path], [im0], self.cap, s
+
+    def _new_video(self, path):
+        """Create a new video capture object."""
+        self.frame = 0
+        self.cap = cv2.VideoCapture(path)
+        self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT) / self.vid_stride)
+        if hasattr(cv2, 'CAP_PROP_ORIENTATION_META'):  # cv2<4.6.0 compatibility
+            self.orientation = int(self.cap.get(cv2.CAP_PROP_ORIENTATION_META))  # rotation degrees
+            # Disable auto-orientation due to known issues in https://github.com/ultralytics/yolov5/issues/8493
+            # self.cap.set(cv2.CAP_PROP_ORIENTATION_AUTO, 0)
+
+    def _cv2_rotate(self, im):
+        """Rotate a cv2 video manually."""
+        if self.orientation == 0:
+            return cv2.rotate(im, cv2.ROTATE_90_CLOCKWISE)
+        elif self.orientation == 180:
+            return cv2.rotate(im, cv2.ROTATE_90_COUNTERCLOCKWISE)
+        elif self.orientation == 90:
+            return cv2.rotate(im, cv2.ROTATE_180)
+        return im
+
+    def __len__(self):
+        """Returns the number of files in the object."""
+        return self.nf  # number of files
+
+
+class LoadPilAndNumpy:
+    """Loader for in-memory PIL images and numpy arrays; the whole input is served once as a single batch."""
+
+    def __init__(self, im0, imgsz=640):
+        """Wrap `im0` in a list if needed, validate every image and assign a path to each one."""
+        images = im0 if isinstance(im0, list) else [im0]
+        # Use the image's own `filename` attribute where present, otherwise generate a placeholder path
+        self.paths = [getattr(img, 'filename', f'image{idx}.jpg') for idx, img in enumerate(images)]
+        self.im0 = list(map(self._single_check, images))
+        self.imgsz = imgsz
+        self.mode = 'image'
+        self.bs = len(self.im0)
+
+    @staticmethod
+    def _single_check(im):
+        """Check that `im` is a PIL image or numpy array; PIL images are converted to a contiguous BGR array."""
+        assert isinstance(im, (Image.Image, np.ndarray)), f'Expected PIL/np.ndarray image type, but got {type(im)}'
+        if not isinstance(im, Image.Image):
+            return im  # numpy arrays are passed through unchanged
+        rgb = im if im.mode == 'RGB' else im.convert('RGB')
+        bgr = np.asarray(rgb)[:, :, ::-1]  # reverse the channel axis
+        return np.ascontiguousarray(bgr)  # contiguous
+
+    def __len__(self):
+        """Return the number of images held."""
+        return len(self.im0)
+
+    def __next__(self):
+        """Return (paths, images, None, '') on the first call, then stop: the batch is served only once."""
+        if self.count != 1:
+            self.count += 1
+            return self.paths, self.im0, None, ''
+        raise StopIteration
+
+    def __iter__(self):
+        """Reset the served-batch counter and return self as the iterator."""
+        self.count = 0
+        return self
+
+
+class LoadTensor:
+    """Loader for a torch.Tensor source; the whole (validated) tensor is served once as a single batch."""
+
+    def __init__(self, im0) -> None:
+        """Validate `im0` as a BCHW tensor and generate one path per image in the batch."""
+        self.im0 = self._single_check(im0)
+        self.bs = self.im0.shape[0]
+        self.mode = 'image'
+        # Enumerate the validated BCHW tensor rather than the raw input: a CHW input would otherwise produce one
+        # path per channel while `bs` is 1
+        self.paths = [getattr(im, 'filename', f'image{i}.jpg') for i, im in enumerate(self.im0)]
+
+    @staticmethod
+    def _single_check(im, stride=32):
+        """
+        Validate and format an image to torch.Tensor.
+
+        Args:
+            im (torch.Tensor): BCHW tensor, or CHW tensor which gets a leading batch dimension (with a warning).
+            stride (int): Value that the height and width must be divisible by.
+
+        Returns:
+            (torch.Tensor): 4-dimensional tensor; divided by 255 as float if its maximum exceeded 1.0.
+
+        Raises:
+            ValueError: If `im` is neither 3- nor 4-dimensional, or its height/width is not divisible by `stride`.
+        """
+        s = f'WARNING ⚠️ torch.Tensor inputs should be BCHW i.e. shape(1, 3, 640, 640) ' \
+            f'divisible by stride {stride}. Input shape{tuple(im.shape)} is incompatible.'
+        if len(im.shape) != 4:
+            if len(im.shape) != 3:
+                raise ValueError(s)
+            LOGGER.warning(s)
+            im = im.unsqueeze(0)  # CHW to BCHW
+        if im.shape[2] % stride or im.shape[3] % stride:
+            raise ValueError(s)
+        if im.max() > 1.0:
+            LOGGER.warning(f'WARNING ⚠️ torch.Tensor inputs should be normalized 0.0-1.0 but max value is {im.max()}. '
+                           f'Dividing input by 255.')
+            im = im.float() / 255.0
+
+        return im
+
+    def __iter__(self):
+        """Reset the served-batch counter and return self as the iterator."""
+        self.count = 0
+        return self
+
+    def __next__(self):
+        """Return (paths, tensor, None, '') on the first call, then raise StopIteration."""
+        if self.count == 1:
+            raise StopIteration
+        self.count += 1
+        return self.paths, self.im0, None, ''
+
+    def __len__(self):
+        """Returns the batch size."""
+        return self.bs
+
+
+def autocast_list(source):
+    """
+    Convert a list of mixed prediction sources into a list of PIL images and numpy arrays.
+
+    Args:
+        source (Iterable): Items that are each a filename/URI (str or Path), a PIL image or a numpy array.
+
+    Returns:
+        (list): One entry per item; filenames and URIs are opened as PIL images, images are kept as they are.
+
+    Raises:
+        TypeError: If an item is of any other type.
+    """
+    converted = []
+    for item in source:
+        if isinstance(item, (Image.Image, np.ndarray)):  # PIL or np Image
+            converted.append(item)
+        elif isinstance(item, (str, Path)):  # filename or uri
+            is_url = str(item).startswith('http')
+            handle = requests.get(item, stream=True).raw if is_url else item
+            converted.append(Image.open(handle))
+        else:
+            raise TypeError(f'type {type(item).__name__} is not a supported Ultralytics prediction source type. \n'
+                            f'See https://docs.ultralytics.com/modes/predict for supported source types.')
+
+    return converted
+
+
+# Source loader classes gathered in one list (note: LoadTensor is not included)
+LOADERS = [LoadStreams, LoadPilAndNumpy, LoadImages, LoadScreenshots]
+
+
+def get_best_youtube_url(url, use_pafy=True):
+    """
+    Retrieves the URL of the best quality MP4 video stream from a given YouTube video.
+
+    This function uses the pafy or yt_dlp library to extract the video info from YouTube. With yt_dlp it picks the
+    best listed MP4 format that has a video codec but no audio codec, and returns the URL of this video stream.
+
+    Args:
+        url (str): The URL of the YouTube video.
+        use_pafy (bool): Use the pafy package, default=True, otherwise use yt_dlp package.
+
+    Returns:
+        (str): The URL of the best quality MP4 video stream, or None if no suitable stream is found.
+    """
+    if use_pafy:
+        check_requirements(('pafy', 'youtube_dl==2020.12.2'))
+        import pafy  # noqa
+        return pafy.new(url).getbest(preftype='mp4').url
+
+    check_requirements('yt-dlp')
+    import yt_dlp
+    with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
+        info_dict = ydl.extract_info(url, download=False)  # extract info
+    # yt-dlp documents `formats` as ordered from worst to best quality, so search from the end of the list.
+    # A missing or None `formats` entry is treated as "no formats" instead of raising TypeError.
+    for f in reversed(info_dict.get('formats') or []):
+        has_video = f.get('vcodec') not in (None, 'none')
+        if has_video and f.get('acodec') == 'none' and f.get('ext') == 'mp4':
+            return f.get('url', None)
+    return None
+
+
+if __name__ == '__main__':
+    # Manual smoke test: read assets/bus.jpg with OpenCV and iterate it through LoadPilAndNumpy
+    img = cv2.imread(str(ROOT / 'assets/bus.jpg'))
+    dataset = LoadPilAndNumpy(im0=img)
+    for d in dataset:
+        print(d[0])  # first element of each yielded item, i.e. the list of paths
--- a/ultralytics/data/scripts/download_weights.sh
+++ b/ultralytics/data/scripts/download_weights.sh
@ -0,0 +1,18 @@
+#!/bin/bash
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Download latest models from https://github.com/ultralytics/assets/releases
+# Example usage: bash ultralytics/data/scripts/download_weights.sh
+# parent
+# └── weights
+#     ├── yolov8n.pt  ← downloads here
+#     ├── yolov8s.pt
+#     └── ...
+
+python - <<EOF
+from ultralytics.utils.downloads import attempt_download_asset
+
+# One asset per model size (n, s, m, l, x) and per suffix ('', -cls, -seg, -pose), sizes in the outer loop
+for size in 'nsmlx':
+    for suffix in ('', '-cls', '-seg', '-pose'):
+        attempt_download_asset(f'weights/yolov8{size}{suffix}.pt')
+
+EOF
--- a/ultralytics/data/scripts/get_coco.sh
+++ b/ultralytics/data/scripts/get_coco.sh
@ -0,0 +1,60 @@
+#!/bin/bash
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Download COCO 2017 dataset http://cocodataset.org
+# Example usage: bash data/scripts/get_coco.sh
+# parent
+# ├── ultralytics
+# └── datasets
+#     └── coco  ← downloads here
+
+# Download archive $2 from base URL $1 into the current directory, unzip it quietly into $3, then delete it
+fetch_and_unzip() {
+  curl -L "$1$2" -o "$2" -# && unzip -q "$2" -d "$3" && rm "$2"
+}
+
+# Arguments (optional) Usage: bash data/scripts/get_coco.sh --train --val --test --segments --sama
+if [ "$#" -eq 0 ]; then
+  # No arguments: train and val images with the default labels
+  train=true
+  val=true
+  test=false
+  segments=false
+  sama=false
+else
+  # Only the flags that are passed get switched on; unrecognised arguments are ignored
+  for opt in "$@"; do
+    case "${opt}" in
+    --train) train=true ;;
+    --val) val=true ;;
+    --test) test=true ;;
+    --segments) segments=true ;;
+    --sama) sama=true ;;
+    esac
+  done
+fi
+
+# Download/unzip labels
+d='../datasets' # unzip directory
+url=https://github.com/ultralytics/yolov5/releases/download/v1.0/
+if [ "$segments" == "true" ]; then
+  f='coco2017labels-segments.zip' # 169 MB
+elif [ "$sama" == "true" ]; then
+  f='coco2017labels-segments-sama.zip' # 199 MB https://www.sama.com/sama-coco-dataset/
+else
+  f='coco2017labels.zip' # 46 MB
+fi
+echo "Downloading $url$f  ..."
+fetch_and_unzip "$url" "$f" "$d" &
+
+# Download/unzip images, each requested split as its own background job:
+#   train2017.zip 19G, 118k images · val2017.zip 1G, 5k images · test2017.zip 7G, 41k images (optional)
+d='../datasets/coco/images' # unzip directory
+url=http://images.cocodataset.org/zips/
+for split in train val test; do
+  if [ "${!split}" == "true" ]; then # ${!split} reads the flag variable whose name is stored in $split
+    f="${split}2017.zip"
+    echo "Downloading $url$f ..."
+    fetch_and_unzip "$url" "$f" "$d" &
+  fi
+done
+wait # finish background tasks
--- a/ultralytics/data/scripts/get_coco128.sh
+++ b/ultralytics/data/scripts/get_coco128.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Download COCO128 dataset https://www.kaggle.com/ultralytics/coco128 (first 128 images from COCO train2017)
+# Example usage: bash data/scripts/get_coco128.sh
+# parent
+# ├── ultralytics
+# └── datasets
+#     └── coco128  ← downloads here
+
+# Download/unzip images and labels
+archive='coco128.zip' # or 'coco128-segments.zip', 68 MB
+base_url=https://github.com/ultralytics/yolov5/releases/download/v1.0/
+dest='../datasets' # unzip directory
+echo "Downloading ${base_url}${archive}  ..."
+# Run download, extraction and archive removal as one background job; each step runs only if the previous succeeded
+(curl -L "${base_url}${archive}" -o "${archive}" -# && unzip -q "${archive}" -d "${dest}" && rm "${archive}") &
+
+wait # finish background tasks
--- a/ultralytics/data/scripts/get_imagenet.sh
+++ b/ultralytics/data/scripts/get_imagenet.sh
@ -0,0 +1,51 @@
+#!/bin/bash
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Download ILSVRC2012 ImageNet dataset https://image-net.org
+# Example usage: bash data/scripts/get_imagenet.sh
+# parent
+# ├── ultralytics
+# └── datasets
+#     └── imagenet  ← downloads here
+
+# Arguments (optional) Usage: bash data/scripts/get_imagenet.sh --train --val
+# With no arguments both splits are downloaded; with arguments only the splits that are named are downloaded
+if [ "$#" -gt 0 ]; then
+  for opt in "$@"; do
+    case "${opt}" in
+    --train) train=true ;;
+    --val) val=true ;;
+    esac
+  done
+else
+  train=true
+  val=true
+fi
+
+# Make dir
+# Every command below runs relative to this directory
+d='../datasets/imagenet' # unzip directory
+mkdir -p $d && cd $d
+
+# Download/unzip train
+# NOTE(review): `mkdir train` fails if the directory already exists, in which case the `cd train` chained after it
+# is skipped and the following commands run in the dataset root instead - confirm whether re-runs must be supported
+if [ "$train" == "true" ]; then
+  wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar # download 138G, 1281167 images
+  mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
+  tar -xf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
+  # Unpack every extracted *.tar into a directory named after the archive, then delete the archive
+  find . -name "*.tar" | while read NAME; do
+    mkdir -p "${NAME%.tar}"
+    tar -xf "${NAME}" -C "${NAME%.tar}"
+    rm -f "${NAME}"
+  done
+  cd .. # back to the dataset root so the val step starts from there
+fi
+
+# Download/unzip val
+# NOTE(review): unlike the train step, this step neither deletes the downloaded tar nor returns to the dataset root
+if [ "$val" == "true" ]; then
+  wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar # download 6.3G, 50000 images
+  mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xf ILSVRC2012_img_val.tar
+  wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash # move into subdirs
+fi
+
+# Delete corrupted image (optional: PNG under JPEG name that may cause dataloaders to fail)
+# rm train/n04266014/n04266014_10835.JPEG
+
+# TFRecords (optional)
+# wget https://raw.githubusercontent.com/tensorflow/models/master/research/slim/datasets/imagenet_lsvrc_2015_synsets.txt
--- a/ultralytics/data/utils.py
+++ b/ultralytics/data/utils.py
@ -0,0 +1,557 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+import contextlib
+import hashlib
+import json
+import os
+import random
+import subprocess
+import time
+import zipfile
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+from tarfile import is_tarfile
+
+import cv2
+import numpy as np
+from PIL import ExifTags, Image, ImageOps
+from tqdm import tqdm
+
+from ultralytics.nn.autobackend import check_class_names
+from ultralytics.utils import (DATASETS_DIR, LOGGER, NUM_THREADS, ROOT, SETTINGS_YAML, clean_url, colorstr, emojis,
+                               yaml_load)
+from ultralytics.utils.checks import check_file, check_font, is_ascii
+from ultralytics.utils.downloads import download, safe_download, unzip_file
+from ultralytics.utils.ops import segments2boxes
+
+# Help text pointing at the custom-data training tutorial
+HELP_URL = 'See https://docs.ultralytics.com/yolov5/tutorials/train_custom_data'
+# Recognised file suffixes, lower-case and without the leading dot
+IMG_FORMATS = 'bmp', 'dng', 'jpeg', 'jpg', 'mpo', 'png', 'tif', 'tiff', 'webp', 'pfm'  # image suffixes
+VID_FORMATS = 'asf', 'avi', 'gif', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'mpg', 'ts', 'wmv', 'webm'  # video suffixes
+# True unless the PIN_MEMORY environment variable is set to something other than 'true' (case-insensitive)
+PIN_MEMORY = str(os.getenv('PIN_MEMORY', True)).lower() == 'true'  # global pin_memory for dataloaders
+IMAGENET_MEAN = 0.485, 0.456, 0.406  # RGB mean
+IMAGENET_STD = 0.229, 0.224, 0.225  # RGB standard deviation
+
+# Get orientation exif tag
+# The loop variable is deliberately left bound at module level: after the break, `orientation` holds the numeric
+# EXIF tag id whose name is 'Orientation', and exif_size() reads it
+for orientation in ExifTags.TAGS.keys():
+    if ExifTags.TAGS[orientation] == 'Orientation':
+        break
+
+
+def img2label_paths(img_paths):
+    """Map image paths to label paths: the last `/images/` becomes `/labels/` and the suffix becomes `.txt`."""
+    images_dir = f'{os.sep}images{os.sep}'
+    labels_dir = f'{os.sep}labels{os.sep}'
+    label_paths = []
+    for img_path in img_paths:
+        swapped = labels_dir.join(img_path.rsplit(images_dir, 1))  # replace only the last /images/ occurrence
+        without_suffix = swapped.rsplit('.', 1)[0]  # drop everything after the final dot
+        label_paths.append(without_suffix + '.txt')
+    return label_paths
+
+
+def get_hash(paths):
+    """Return a SHA-256 hex digest built from the total size of the existing paths and the concatenated paths."""
+    total_bytes = 0
+    for p in paths:
+        if os.path.exists(p):  # paths that do not exist contribute no size
+            total_bytes += os.path.getsize(p)
+    digest = hashlib.sha256(str(total_bytes).encode())  # hash sizes
+    digest.update(''.join(paths).encode())  # hash paths
+    return digest.hexdigest()
+
+
+def exif_size(img):
+    """Return the PIL image size as (width, height), swapped when the EXIF orientation value is 6 or 8."""
+    size = img.size  # (width, height)
+    # Best effort: any failure while reading the EXIF data leaves the size unchanged
+    with contextlib.suppress(Exception):
+        exif = dict(img._getexif().items())
+        if exif[orientation] in (6, 8):  # rotation 270 or 90
+            size = size[1], size[0]
+    return size
+
+
+def verify_image_label(args):
+    """
+    Verify one image-label pair.
+
+    Args:
+        args (tuple): (im_file, lb_file, prefix, keypoint, num_cls, nkpt, ndim), where im_file and lb_file are the
+            image and label file paths, prefix is prepended to warning messages, keypoint selects the keypoint
+            label layout, num_cls is the dataset class count, and nkpt / ndim are the number of keypoints and
+            the number of values per keypoint.
+
+    Returns:
+        (tuple | list): (im_file, lb, shape, segments, keypoints, nm, nf, ne, nc, msg). `lb` is a float32 array of
+            (cls, xywh) rows, `shape` is (height, width), and nm / nf / ne / nc are 0/1 flags for a missing, found,
+            empty or corrupt label. If anything raises, the first five items are None, nc is 1 and msg carries
+            the error.
+    """
+    im_file, lb_file, prefix, keypoint, num_cls, nkpt, ndim = args
+    # Number (missing, found, empty, corrupt), message, segments, keypoints
+    nm, nf, ne, nc, msg, segments, keypoints = 0, 0, 0, 0, '', [], None
+    try:
+        # Verify images
+        im = Image.open(im_file)
+        im.verify()  # PIL verify
+        shape = exif_size(im)  # image size
+        shape = (shape[1], shape[0])  # hw
+        assert (shape[0] > 9) & (shape[1] > 9), f'image size {shape} <10 pixels'
+        assert im.format.lower() in IMG_FORMATS, f'invalid image format {im.format}'
+        if im.format.lower() in ('jpg', 'jpeg'):
+            with open(im_file, 'rb') as f:
+                f.seek(-2, 2)  # last two bytes of the file
+                if f.read() != b'\xff\xd9':  # corrupt JPEG
+                    ImageOps.exif_transpose(Image.open(im_file)).save(im_file, 'JPEG', subsampling=0, quality=100)
+                    msg = f'{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved'
+
+        # Verify labels
+        if os.path.isfile(lb_file):
+            nf = 1  # label found
+            with open(lb_file) as f:
+                lb = [x.split() for x in f.read().strip().splitlines() if len(x)]
+                if any(len(x) > 6 for x in lb) and (not keypoint):  # is segment
+                    classes = np.array([x[0] for x in lb], dtype=np.float32)
+                    segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in lb]  # (cls, xy1...)
+                    lb = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1)  # (cls, xywh)
+                lb = np.array(lb, dtype=np.float32)
+            nl = len(lb)
+            if nl:
+                if keypoint:
+                    assert lb.shape[1] == (5 + nkpt * ndim), f'labels require {(5 + nkpt * ndim)} columns each'
+                    assert (lb[:, 5::ndim] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
+                    assert (lb[:, 6::ndim] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
+                else:
+                    assert lb.shape[1] == 5, f'labels require 5 columns, {lb.shape[1]} columns detected'
+                    assert (lb[:, 1:] <= 1).all(), \
+                        f'non-normalized or out of bounds coordinates {lb[:, 1:][lb[:, 1:] > 1]}'
+                    assert (lb >= 0).all(), f'negative label values {lb[lb < 0]}'
+                # All labels
+                max_cls = int(lb[:, 0].max())  # max label count
+                # Class ids are zero-based, so the largest valid id is num_cls - 1 (strict comparison)
+                assert max_cls < num_cls, \
+                    f'Label class {max_cls} exceeds dataset class count {num_cls}. ' \
+                    f'Possible class labels are 0-{num_cls - 1}'
+                _, i = np.unique(lb, axis=0, return_index=True)
+                if len(i) < nl:  # duplicate row check
+                    lb = lb[i]  # remove duplicates
+                    if segments:
+                        segments = [segments[x] for x in i]
+                    msg = f'{prefix}WARNING ⚠️ {im_file}: {nl - len(i)} duplicate labels removed'
+            else:
+                ne = 1  # label empty
+                lb = np.zeros((0, (5 + nkpt * ndim)), dtype=np.float32) if keypoint else np.zeros(
+                    (0, 5), dtype=np.float32)
+        else:
+            nm = 1  # label missing
+            lb = np.zeros((0, (5 + nkpt * ndim)), dtype=np.float32) if keypoint else np.zeros((0, 5), dtype=np.float32)
+        if keypoint:
+            keypoints = lb[:, 5:].reshape(-1, nkpt, ndim)
+            if ndim == 2:
+                # Append a mask channel: 0.0 where either coordinate is negative, 1.0 otherwise
+                kpt_mask = np.ones(keypoints.shape[:2], dtype=np.float32)
+                kpt_mask = np.where(keypoints[..., 0] < 0, 0.0, kpt_mask)
+                kpt_mask = np.where(keypoints[..., 1] < 0, 0.0, kpt_mask)
+                keypoints = np.concatenate([keypoints, kpt_mask[..., None]], axis=-1)  # (nl, nkpt, 3)
+        lb = lb[:, :5]  # keep (cls, xywh) only; keypoint columns were split off above
+        return im_file, lb, shape, segments, keypoints, nm, nf, ne, nc, msg
+    except Exception as e:
+        # Any failure (including the asserts above) marks the pair as corrupt instead of propagating
+        nc = 1
+        msg = f'{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}'
+        return [None, None, None, None, None, nm, nf, ne, nc, msg]
+
+
+def polygon2mask(imgsz, polygons, color=1, downsample_ratio=1):
+    """
+    Rasterise polygons into one uint8 mask and resize it by the downsample ratio.
+
+    Args:
+        imgsz (tuple): The image size.
+        polygons (list[np.ndarray]): [N, M], N is the number of polygons, M is the number of flattened
+            coordinate values (divisible by 2).
+        color (int): Value written inside the polygons.
+        downsample_ratio (int): Integer factor by which both mask dimensions are divided.
+
+    Returns:
+        (np.ndarray): Mask resized to (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio).
+    """
+    canvas = np.zeros(imgsz, dtype=np.uint8)
+    points = np.asarray(polygons).astype(np.int32)
+    points = points.reshape(points.shape[0], -1, 2)  # (N, number of points, xy)
+    cv2.fillPoly(canvas, points, color=color)
+    out_h = imgsz[0] // downsample_ratio
+    out_w = imgsz[1] // downsample_ratio
+    # NOTE: fill at full resolution first and resize afterwards, to keep the same way of loss calculation as
+    # when mask-ratio=1.
+    return cv2.resize(canvas, (out_w, out_h))
+
+
+def polygons2masks(imgsz, polygons, color, downsample_ratio=1):
+    """
+    Rasterise every polygon into its own mask and stack the masks into one array.
+
+    Args:
+        imgsz (tuple): The image size.
+        polygons (list[np.ndarray]): One array of polygon coordinates per mask; each is flattened before drawing.
+        color (int): Value written inside each polygon.
+        downsample_ratio (int): Integer factor by which both mask dimensions are divided.
+
+    Returns:
+        (np.ndarray): Array holding one mask per polygon, in input order.
+    """
+    return np.array([polygon2mask(imgsz, [poly.reshape(-1)], color, downsample_ratio) for poly in polygons])
+
+
+def polygons2masks_overlap(imgsz, segments, downsample_ratio=1):
+    """
+    Merge all segments into a single index mask in which larger segments are drawn first.
+
+    Args:
+        imgsz (tuple): The image size.
+        segments (list[np.ndarray]): One array of polygon coordinates per instance.
+        downsample_ratio (int): Integer factor by which both mask dimensions are divided.
+
+    Returns:
+        (tuple): (masks, index), where `masks` holds 0 for background and k for the k-th largest segment, and
+            `index` is the order of the segments sorted by descending mask area.
+    """
+    out_shape = (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio)
+    # uint8 can only label up to 255 instances; switch to int32 beyond that
+    masks = np.zeros(out_shape, dtype=np.int32 if len(segments) > 255 else np.uint8)
+    per_segment = [
+        polygon2mask(imgsz, [seg.reshape(-1)], downsample_ratio=downsample_ratio, color=1) for seg in segments]
+    areas = np.asarray([m.sum() for m in per_segment])
+    index = np.argsort(-areas)  # largest area first
+    ordered = np.array(per_segment)[index]
+    for rank, seg_mask in enumerate(ordered, start=1):
+        masks = masks + seg_mask * rank
+        masks = np.clip(masks, a_min=0, a_max=rank)  # where segments overlap, cap the sum at the current label
+    return masks, index
+
+
+def check_det_dataset(dataset, autodownload=True):
+    """
+    Download, check and/or unzip dataset if not found locally.
+
+    Args:
+        dataset (str | Path): Dataset YAML file, or a zip/tar archive that contains one.
+        autodownload (bool): Run the YAML's `download` entry when the validation paths are missing.
+
+    Returns:
+        (dict): The parsed dataset dictionary with 'names' and 'nc' filled in, 'path' set to the dataset root and
+            'train' / 'val' / 'test' rewritten as resolved path strings.
+
+    Raises:
+        SyntaxError: If 'train' or 'val' is missing, if neither 'names' nor 'nc' is given, or if both are given
+            and disagree.
+        FileNotFoundError: If validation paths are missing and there is no `download` entry or `autodownload`
+            is False.
+    """
+    data = check_file(dataset)  # NOTE(review): presumably resolves (and may fetch) the file - confirm in check_file
+
+    # Download (optional)
+    # Archives are unpacked under DATASETS_DIR and the first *.yaml found inside (recursively) becomes the dataset
+    # file; autodownload is then switched off
+    extract_dir = ''
+    if isinstance(data, (str, Path)) and (zipfile.is_zipfile(data) or is_tarfile(data)):
+        new_dir = safe_download(data, dir=DATASETS_DIR, unzip=True, delete=False, curl=False)
+        data = next((DATASETS_DIR / new_dir).rglob('*.yaml'))
+        extract_dir, autodownload = data.parent, False
+
+    # Read yaml (optional)
+    if isinstance(data, (str, Path)):
+        data = yaml_load(data, append_filename=True)  # dictionary
+
+    # Checks
+    for k in 'train', 'val':
+        if k not in data:
+            raise SyntaxError(
+                emojis(f"{dataset} '{k}:' key missing ❌.\n'train' and 'val' are required in all data YAMLs."))
+    if 'names' not in data and 'nc' not in data:
+        raise SyntaxError(emojis(f"{dataset} key missing ❌.\n either 'names' or 'nc' are required in all data YAMLs."))
+    if 'names' in data and 'nc' in data and len(data['names']) != data['nc']:
+        raise SyntaxError(emojis(f"{dataset} 'names' length {len(data['names'])} and 'nc: {data['nc']}' must match."))
+    # Whichever of 'names' / 'nc' is missing is derived from the other
+    if 'names' not in data:
+        data['names'] = [f'class_{i}' for i in range(data['nc'])]
+    else:
+        data['nc'] = len(data['names'])
+
+    data['names'] = check_class_names(data['names'])
+
+    # Resolve paths
+    # Root priority: extracted archive dir, then the YAML 'path' key, then the directory of the YAML file
+    path = Path(extract_dir or data.get('path') or Path(data.get('yaml_file', '')).parent)  # dataset root
+
+    if not path.is_absolute():
+        path = (DATASETS_DIR / path).resolve()  # relative roots are taken relative to DATASETS_DIR
+    data['path'] = path  # download scripts
+    for k in 'train', 'val', 'test':
+        if data.get(k):  # prepend path
+            if isinstance(data[k], str):
+                x = (path / data[k]).resolve()
+                # If the path does not exist and starts with '../', retry with that prefix stripped
+                if not x.exists() and data[k].startswith('../'):
+                    x = (path / data[k][3:]).resolve()
+                data[k] = str(x)
+            else:
+                data[k] = [str((path / x).resolve()) for x in data[k]]
+
+    # Parse yaml
+    train, val, test, s = (data.get(x) for x in ('train', 'val', 'test', 'download'))
+    if val:
+        val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path
+        if not all(x.exists() for x in val):
+            name = clean_url(dataset)  # dataset name with URL auth stripped
+            m = f"\nDataset '{name}' images not found ⚠️, missing paths %s" % [str(x) for x in val if not x.exists()]
+            if s and autodownload:
+                LOGGER.warning(m)
+            else:
+                m += f"\nNote dataset download directory is '{DATASETS_DIR}'. You can update this in '{SETTINGS_YAML}'"
+                raise FileNotFoundError(m)
+            t = time.time()
+            # NOTE(review): `s` comes straight from the dataset YAML and is run through os.system() or exec()
+            # below, so a dataset YAML from an untrusted source can execute arbitrary code
+            if s.startswith('http') and s.endswith('.zip'):  # URL
+                safe_download(url=s, dir=DATASETS_DIR, delete=True)
+                r = None  # success
+            elif s.startswith('bash '):  # bash script
+                LOGGER.info(f'Running {s} ...')
+                r = os.system(s)
+            else:  # python script
+                r = exec(s, {'yaml': data})  # return None
+            dt = f'({round(time.time() - t, 1)}s)'
+            # Only the return code is inspected; the missing paths are not checked again after the download
+            s = f"success ✅ {dt}, saved to {colorstr('bold', DATASETS_DIR)}" if r in (0, None) else f'failure {dt} ❌'
+            LOGGER.info(f'Dataset download {s}\n')
+    check_font('Arial.ttf' if is_ascii(data['names']) else 'Arial.Unicode.ttf')  # download fonts
+
+    return data  # dictionary
+
+
+def check_cls_dataset(dataset: str, split=''):
+    """
+    Locate a classification dataset such as Imagenet, downloading it first when it is not on disk.
+
+    `dataset` is used as given if it is an existing directory, otherwise it is looked up under DATASETS_DIR. When
+    neither exists, 'imagenet' is fetched with the get_imagenet.sh script and any other name is downloaded as
+    `<dataset>.zip` from the yolov5 v1.0 release assets.
+
+    Args:
+        dataset (str): The name of (or path to) the dataset.
+        split (str, optional): The split of the dataset. Either 'val', 'test', or ''. Defaults to ''.
+
+    Returns:
+        (dict): A dictionary containing the following keys:
+            - 'train' (Path): The directory path containing the training set of the dataset.
+            - 'val' (Path): The validation directory, or the test directory if there is no validation directory.
+            - 'test' (Path): The test directory, or the validation directory if there is no test directory.
+            - 'nc' (int): The number of sub-directories of the training directory.
+            - 'names' (dict): Index to class name, taken from the sorted training sub-directory names.
+
+    Raises:
+        subprocess.CalledProcessError: If the imagenet download script exits with a non-zero status.
+    """
+
+    dataset = Path(dataset)
+    data_dir = dataset if dataset.is_dir() else DATASETS_DIR / dataset
+    data_dir = data_dir.resolve()
+    if not data_dir.is_dir():
+        LOGGER.info(f'\nDataset not found ⚠️, missing path {data_dir}, attempting download...')
+        start = time.time()
+        if str(dataset) != 'imagenet':
+            url = f'https://github.com/ultralytics/yolov5/releases/download/v1.0/{dataset}.zip'
+            download(url, dir=data_dir.parent)
+        else:
+            subprocess.run(f"bash {ROOT / 'data/scripts/get_imagenet.sh'}", shell=True, check=True)
+        LOGGER.info(f"Dataset download success ✅ ({time.time() - start:.1f}s), saved to {colorstr('bold', data_dir)}\n")
+
+    train_set = data_dir / 'train'
+    val_dir, test_dir = data_dir / 'val', data_dir / 'test'
+    val_set = val_dir if val_dir.exists() else None  # data/test or data/val
+    test_set = test_dir if test_dir.exists() else None  # data/val or data/test
+    if split == 'val' and not val_set:
+        LOGGER.info("WARNING ⚠️ Dataset 'split=val' not found, using 'split=test' instead.")
+    elif split == 'test' and not test_set:
+        LOGGER.info("WARNING ⚠️ Dataset 'split=test' not found, using 'split=val' instead.")
+
+    nc = sum(1 for entry in train_set.glob('*') if entry.is_dir())  # number of classes
+    class_dirs = sorted(entry.name for entry in train_set.iterdir() if entry.is_dir())  # class names list
+    names = dict(enumerate(class_dirs))
+    return {'train': train_set, 'val': val_set or test_set, 'test': test_set or val_set, 'nc': nc, 'names': names}
+
+
+class HUBDatasetStats():
+    """
+    A class for generating HUB dataset JSON and `-hub` dataset directory.
+
+    Args:
+        path (str): Path to data.yaml or data.zip (with data.yaml inside data.zip). Default is 'coco128.yaml'.
+        task (str): Dataset task. Options are 'detect', 'segment', 'pose', 'classify'. Default is 'detect'.
+        autodownload (bool): Attempt to download dataset if not found locally. Default is False.
+
+    Usage
+        from ultralytics.data.utils import HUBDatasetStats
+        stats = HUBDatasetStats('/Users/glennjocher/Downloads/coco8.zip', task='detect')  # detect dataset
+        stats = HUBDatasetStats('/Users/glennjocher/Downloads/coco8-seg.zip', task='segment')  # segment dataset
+        stats = HUBDatasetStats('/Users/glennjocher/Downloads/coco8-pose.zip', task='pose')  # pose dataset
+        stats.get_json(save=False)
+        stats.process_images()
+    """
+
+    def __init__(self, path='coco128.yaml', task='detect', autodownload=False):
+        """
+        Unzip (if needed) and check the dataset, then create the `<dataset>-hub/images` directory.
+
+        Any failure while checking the dataset is re-raised as a generic Exception with the original error chained.
+        """
+        LOGGER.info(f'Starting HUB dataset checks for {path}....')
+        zipped, data_dir, yaml_path = self._unzip(Path(path))
+        try:
+            data = check_det_dataset(yaml_path, autodownload)  # data dict
+            if zipped:
+                data['path'] = data_dir  # point the dataset root at the unzipped directory
+        except Exception as e:
+            raise Exception('error/HUB/dataset_stats/yaml_load') from e
+
+        class_names = data['names']
+        self.hub_dir = Path(f"{data['path']}-hub")
+        self.im_dir = self.hub_dir / 'images'
+        self.im_dir.mkdir(parents=True, exist_ok=True)  # makes /images
+        self.stats = {'nc': len(class_names), 'names': list(class_names.values())}  # statistics dictionary
+        self.data = data
+        self.task = task  # detect, segment, pose, classify
+
+    @staticmethod
+    def _find_yaml(dir):
+        """Return the single *.yaml file in `dir`, preferring one named after the directory if several exist."""
+        candidates = list(dir.glob('*.yaml'))  # try root level first
+        if not candidates:
+            candidates = list(dir.rglob('*.yaml'))  # and then recursive
+        assert candidates, f'No *.yaml file found in {dir}'
+        if len(candidates) > 1:
+            candidates = [c for c in candidates if c.stem == dir.stem]  # prefer *.yaml files that match dir name
+            assert candidates, f'Multiple *.yaml files found in {dir}, only 1 *.yaml file allowed'
+        assert len(candidates) == 1, f'Multiple *.yaml files found: {candidates}, only 1 *.yaml file allowed in {dir}'
+        return candidates[0]
+
+    def _unzip(self, path):
+        """
+        Unzip `path` when it is a *.zip file and locate the dataset YAML inside it.
+
+        Returns:
+            (tuple): (zipped, data_dir, yaml_path). For a non-zip path this is (False, None, path).
+        """
+        if str(path).endswith('.zip'):
+            unzip_dir = unzip_file(path, path=path.parent)
+            assert unzip_dir.is_dir(), f'Error unzipping {path}, {unzip_dir} not found. ' \
+                                       f'path/to/abc.zip MUST unzip to path/to/abc/'
+            yaml_path = self._find_yaml(unzip_dir)
+            return True, str(unzip_dir), yaml_path  # zipped, data_dir, yaml_path
+        return False, None, path  # path is data.yaml
+
+    def _hub_ops(self, f):
+        """Pass image `f` to compress_one_image, targeting the same file name inside the HUB images directory."""
+        target = self.im_dir / Path(f).name  # save to dataset-hub
+        compress_one_image(f, target)
+
+    def get_json(self, save=False, verbose=False):
+        """
+        Build the dataset statistics dictionary for Ultralytics HUB.
+
+        Args:
+            save (bool): Write the statistics to `stats.json` inside the hub directory.
+            verbose (bool): Log the statistics as indented JSON.
+
+        Returns:
+            (dict): `self.stats`, with one entry per split ('train', 'val', 'test'); a split missing from the
+                dataset is stored as None.
+
+        Raises:
+            ValueError: If labels have to be rounded and `self.task` is not 'detect', 'segment' or 'pose'.
+        """
+        from ultralytics.data import YOLODataset  # ClassificationDataset
+
+        def _coordinates(labels):
+            """Select the per-instance coordinate rows that belong to the current task."""
+            if self.task == 'detect':
+                return labels['bboxes']
+            if self.task == 'segment':
+                return [segment.flatten() for segment in labels['segments']]
+            if self.task == 'pose':
+                kpts = labels['keypoints']
+                return np.concatenate((labels['bboxes'], kpts.reshape(kpts.shape[0], -1)), 1)
+            raise ValueError('Undefined dataset task.')
+
+        def _round(labels):
+            """Update labels to integer class and 4 decimal place floats."""
+            coords = _coordinates(labels)
+            rows = []
+            for cls, points in zip(labels['cls'], coords):
+                rows.append([int(cls), *(round(float(value), 4) for value in points)])
+            return rows
+
+        for split in ('train', 'val', 'test'):
+            if self.data.get(split) is None:
+                self.stats[split] = None  # i.e. no test set
+                continue
+
+            dataset = YOLODataset(img_path=self.data[split],
+                                  data=self.data,
+                                  use_segments=self.task == 'segment',
+                                  use_keypoints=self.task == 'pose')
+            # One row per image, one column per class: number of instances of that class in that image
+            counts = np.array([
+                np.bincount(label['cls'].astype(int).flatten(), minlength=self.data['nc'])
+                for label in tqdm(dataset.labels, total=len(dataset), desc='Statistics')])  # shape(128x80)
+            instance_stats = {'total': int(counts.sum()), 'per_class': counts.sum(0).tolist()}
+            image_stats = {
+                'total': len(dataset),
+                'unlabelled': int(np.all(counts == 0, 1).sum()),
+                'per_class': (counts > 0).sum(0).tolist()}
+            rounded = [{Path(im_file).name: _round(label)} for im_file, label in zip(dataset.im_files, dataset.labels)]
+            self.stats[split] = {'instance_stats': instance_stats, 'image_stats': image_stats, 'labels': rounded}
+
+        # Save, print and return
+        if save:
+            stats_path = self.hub_dir / 'stats.json'
+            LOGGER.info(f'Saving {stats_path.resolve()}...')
+            with open(stats_path, 'w') as f:
+                json.dump(self.stats, f)  # save stats.json
+        if verbose:
+            LOGGER.info(json.dumps(self.stats, indent=2, sort_keys=False))
+        return self.stats
+
+    def process_images(self):
+        """Compress images for Ultralytics HUB."""
+        from ultralytics.data import YOLODataset  # ClassificationDataset
+
+        for split in 'train', 'val', 'test':
+            if self.data.get(split) is None:
+                continue
+            dataset = YOLODataset(img_path=self.data[split], data=self.data)
+            with ThreadPool(NUM_THREADS) as pool:
+                for _ in tqdm(pool.imap(self._hub_ops, dataset.im_files), total=len(dataset), desc=f'{split} images'):
+                    pass
+        LOGGER.info(f'Done. All images saved to {self.im_dir}')
+        return self.im_dir
+
+
+def compress_one_image(f, f_new=None, max_dim=1920, quality=50):
+    """
+    Compresses a single image file to reduced size while preserving its aspect ratio and quality using either the
+    Python Imaging Library (PIL) or OpenCV library. If the input image is smaller than the maximum dimension, it will
+    not be resized.
+
+    Args:
+        f (str): The path to the input image file.
+        f_new (str, optional): The path to the output image file. If not specified, the input file will be overwritten.
+        max_dim (int, optional): The maximum dimension (width or height) of the output image. Default is 1920 pixels.
+        quality (int, optional): The image compression quality as a percentage. Default is 50%.
+
+    Usage:
+        from pathlib import Path
+        from ultralytics.data.utils import compress_one_image
+        for f in Path('/Users/glennjocher/Downloads/dataset').rglob('*.jpg'):
+            compress_one_image(f)
+    """
+    try:  # use PIL
+        im = Image.open(f)
+        r = max_dim / max(im.height, im.width)  # ratio
+        if r < 1.0:  # image too large
+            im = im.resize((int(im.width * r), int(im.height * r)))
+        im.save(f_new or f, 'JPEG', quality=quality, optimize=True)  # save
+    except Exception as e:  # use OpenCV
+        LOGGER.info(f'WARNING ⚠️ HUB ops PIL failure {f}: {e}')
+        im = cv2.imread(f)
+        im_height, im_width = im.shape[:2]
+        r = max_dim / max(im_height, im_width)  # ratio
+        if r < 1.0:  # image too large
+            im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
+        cv2.imwrite(str(f_new or f), im)
+
+
+def delete_dsstore(path):
+    """
+    Deletes all ".DS_store" files under a specified directory.
+
+    Args:
+        path (str, optional): The directory path where the ".DS_store" files should be deleted.
+
+    Usage:
+        from ultralytics.data.utils import delete_dsstore
+        delete_dsstore('/Users/glennjocher/Downloads/dataset')
+
+    Note:
+        ".DS_store" files are created by the Apple operating system and contain metadata about folders and files. They
+        are hidden system files and can cause issues when transferring files between different operating systems.
+    """
+    # Delete Apple .DS_store files
+    files = list(Path(path).rglob('.DS_store'))
+    LOGGER.info(f'Deleting *.DS_store files: {files}')
+    for f in files:
+        f.unlink()
+
+
+def zip_directory(dir, use_zipfile_library=True):
+    """
+    Zips a directory and saves the archive to the specified output path.
+
+    Args:
+        dir (str): The path to the directory to be zipped.
+        use_zipfile_library (bool): Whether to use zipfile library or shutil for zipping.
+
+    Usage:
+        from ultralytics.data.utils import zip_directory
+        zip_directory('/Users/glennjocher/Downloads/playground')
+
+        zip -r coco8-pose.zip coco8-pose
+    """
+    delete_dsstore(dir)
+    if use_zipfile_library:
+        dir = Path(dir)
+        with zipfile.ZipFile(dir.with_suffix('.zip'), 'w', zipfile.ZIP_DEFLATED) as zip_file:
+            for file_path in dir.glob('**/*'):
+                if file_path.is_file():
+                    zip_file.write(file_path, file_path.relative_to(dir))
+    else:
+        import shutil
+        shutil.make_archive(dir, 'zip', dir)
+
+
+def autosplit(path=DATASETS_DIR / 'coco128/images', weights=(0.9, 0.1, 0.0), annotated_only=False):
+    """
+    Autosplit a dataset into train/val/test splits and save the resulting splits into autosplit_*.txt files.
+
+    Args:
+        path (Path, optional): Path to images directory. Defaults to DATASETS_DIR / 'coco128/images'.
+        weights (list | tuple, optional): Train, validation, and test split fractions. Defaults to (0.9, 0.1, 0.0).
+        annotated_only (bool, optional): If True, only images with an associated txt file are used. Defaults to False.
+
+    Usage:
+        from utils.dataloaders import autosplit
+        autosplit()
+    """
+
+    path = Path(path)  # images dir
+    files = sorted(x for x in path.rglob('*.*') if x.suffix[1:].lower() in IMG_FORMATS)  # image files only
+    n = len(files)  # number of files
+    random.seed(0)  # for reproducibility
+    indices = random.choices([0, 1, 2], weights=weights, k=n)  # assign each image to a split
+
+    txt = ['autosplit_train.txt', 'autosplit_val.txt', 'autosplit_test.txt']  # 3 txt files
+    for x in txt:
+        if (path.parent / x).exists():
+            (path.parent / x).unlink()  # remove existing
+
+    LOGGER.info(f'Autosplitting images from {path}' + ', using *.txt labeled images only' * annotated_only)
+    for i, img in tqdm(zip(indices, files), total=n):
+        if not annotated_only or Path(img2label_paths([str(img)])[0]).exists():  # check label
+            with open(path.parent / txt[i], 'a') as f:
+                f.write(f'./{img.relative_to(path.parent).as_posix()}' + '\n')  # add image to txt file