From 8996c5c6cf6470109bed60573ac7edc75de9f4c4 Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Mon, 2 Jan 2023 20:42:30 +0530 Subject: [PATCH] [Docs]: Link buttons, add autobackend, BaseModel and ops (#130) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Glenn Jocher --- README.md | 5 +- docs/quickstart.md | 9 +- docs/reference/nn.md | 15 ++ docs/reference/ops.md | 162 +++++++++++++++++++ docs/sdk.md | 15 +- mkdocs.yml | 7 +- ultralytics/__init__.py | 1 + ultralytics/nn/autobackend.py | 76 +++++++-- ultralytics/nn/tasks.py | 72 ++++++++- ultralytics/yolo/utils/ops.py | 292 +++++++++++++++++++++++++++------- 10 files changed, 560 insertions(+), 94 deletions(-) create mode 100644 docs/reference/nn.md create mode 100644 docs/reference/ops.md diff --git a/README.md b/README.md index 1dcfe92..d4993b6 100644 --- a/README.md +++ b/README.md @@ -34,11 +34,10 @@ To use pythonic interface of Ultralytics YOLO model ```python from ultralytics import YOLO -model = YOLO.new("yolov8n.yaml") # create a new model from scratch -model = YOLO.load( +model = YOLO("yolov8n.yaml") # create a new model from scratch +model = YOLO( "yolov8n.pt" ) # load a pretrained model (recommended for best training results) - results = model.train(data="coco128.yaml", epochs=100, imgsz=640, ...) results = model.val() results = model.predict(source="bus.jpg") diff --git a/docs/quickstart.md b/docs/quickstart.md index 42d0f2a..d73ac8a 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -33,7 +33,7 @@ CLI requires no customization or code. You can simply run all tasks from the ter ```bash yolo task=detect mode=train model=s.yaml device=\'0,1,2,3\' ``` -[CLI Guide](#){ .md-button .md-button--primary} +[CLI Guide](cli.md){ .md-button .md-button--primary} ## Python API Ultralytics YOLO comes with pythonic Model and Trainer interface. @@ -42,10 +42,9 @@ Ultralytics YOLO comes with pythonic Model and Trainer interface. import ultralytics from ultralytics import YOLO - model = YOLO() - model.new("s-seg.yaml") # automatically detects task type - model.load("s-seg.pt") # load checkpoint + model = YOLO("s-seg.yaml") # automatically detects task type + model = YOLO("s-seg.pt") # load checkpoint model.train(data="coco128-segments", epochs=1, lr0=0.01, ...) model.train(data="coco128-segments", epochs=1, lr0=0.01, device="0,1,2,3") # DDP mode ``` -[API Guide](#){ .md-button .md-button--primary} +[API Guide](sdk.md){ .md-button .md-button--primary} diff --git a/docs/reference/nn.md b/docs/reference/nn.md new file mode 100644 index 0000000..8a66fce --- /dev/null +++ b/docs/reference/nn.md @@ -0,0 +1,15 @@ +# nn Module +Ultralytics nn module contains 3 main components: + +1. **AutoBackend**: A module that can run inference on all popular model formats +2. **BaseModel**: `BaseModel` class defines the operations supported by tasks like Detection and Segmentation +3. **modules**: Optimized and reusable neural network blocks built on PyTorch. 
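
As a quick orientation for reviewers, here is a minimal usage sketch of the `AutoBackend` component listed above. It is illustrative only (not part of this patch) and relies solely on the constructor, `warmup()`, and `forward()` signatures added in `ultralytics/nn/autobackend.py`; the weights path and input tensor are placeholders.

```python
import torch

from ultralytics.nn.autobackend import AutoBackend

# Construct directly from a weights path; the backend (PyTorch, ONNX, TensorRT, ...)
# is inferred from the file suffix via AutoBackend._model_type().
model = AutoBackend("yolov8n.pt", device=torch.device("cpu"), fp16=False, fuse=True)

model.warmup(imgsz=(1, 3, 640, 640))  # optional warmup (skipped on CPU in the implementation)
im = torch.zeros(1, 3, 640, 640)      # dummy BCHW input tensor
y = model(im)                         # forward pass returning raw predictions
```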
+ +## AutoBackend +:::ultralytics.nn.autobackend.AutoBackend + +## BaseModel +:::ultralytics.nn.tasks.BaseModel + +## Modules +TODO \ No newline at end of file diff --git a/docs/reference/ops.md b/docs/reference/ops.md new file mode 100644 index 0000000..ed85005 --- /dev/null +++ b/docs/reference/ops.md @@ -0,0 +1,162 @@ +This module contains optimized deep learning related operations used in the Ultralytics YOLO framework +## Non-max suppression +:::ultralytics.ops.non_max_suppression + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## Scale boxes +:::ultralytics.ops.scale_boxes + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## Scale image +:::ultralytics.ops.scale_image + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## clip boxes +:::ultralytics.ops.clip_boxes + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +# Box Format Conversion +## xyxy2xywh +:::ultralytics.ops.xyxy2xywh + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## xywh2xyxy +:::ultralytics.ops.xywh2xyxy + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## xywhn2xyxy +:::ultralytics.ops.xywhn2xyxy + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## xyxy2xywhn +:::ultralytics.ops.xyxy2xywhn + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## xyn2xy +:::ultralytics.ops.xyn2xy + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## xywh2ltwh +:::ultralytics.ops.xywh2ltwh + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## xyxy2ltwh +:::ultralytics.ops.xyxy2ltwh + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## ltwh2xywh +:::ultralytics.ops.ltwh2xywh + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## ltwh2xyxy +:::ultralytics.ops.ltwh2xyxy + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## segment2box +:::ultralytics.ops.segment2box + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +# Mask Operations +## resample_segments +:::ultralytics.ops.resample_segments + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## crop_mask +:::ultralytics.ops.crop_mask + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## process_mask_upsample +:::ultralytics.ops.process_mask_upsample + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## process_mask +:::ultralytics.ops.process_mask + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## process_mask_native +:::ultralytics.ops.process_mask_native + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## scale_segments +:::ultralytics.ops.scale_segments + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## masks2segments +:::ultralytics.ops.masks2segments + handler: python + options: + show_source: false + show_root_toc_entry: false +--- +## clip_segments +:::ultralytics.ops.clip_segments + handler: python + options: + show_source: false + show_root_toc_entry: false +--- + + + + + diff --git a/docs/sdk.md b/docs/sdk.md index 148f9fd..f858d8c 100644 --- a/docs/sdk.md +++ b/docs/sdk.md @@ -6,8 +6,7 @@ This is 
the simplest way of simply using yolo models in a python environment. It ```python from ultralytics import YOLO - model = YOLO() - model.new("n.yaml") # pass any model type + model = YOLO("yolov8n.yaml") model(img_tensor) # Or model.forward(). inference. model.train(data="coco128.yaml", epochs=5) ``` @@ -16,10 +15,9 @@ This is the simplest way of simply using yolo models in a python environment. It ```python from ultralytics import YOLO - model = YOLO() - model.load("n.pt") # pass any model type + model = YOLO("yolov8n.pt") # pass any model type model(...) # inference - model.train(data="coco128.yaml", epochs=5) + model.train(epochs=5) ``` === "Resume Training" @@ -35,8 +33,7 @@ This is the simplest way of simply using yolo models in a python environment. It ```python from ultralytics import YOLO - model = YOLO() - model.load("model.pt") + model = YOLO("model.pt") model.predict(source="0") # accepts all formats - img/folder/vid.*(mp4/format). 0 for webcam model.predict(source="folder", view_img=True) # Display preds. Accepts all yolo predict arguments @@ -48,7 +45,7 @@ This is the simplest way of simply using yolo models in a python environment. It ```python from ultralytics import YOLO - model = YOLO() + model = YOLO("model.pt") model.fuse() model.info(verbose=True) # Print model information model.export(format=) # TODO: @@ -61,7 +58,7 @@ This is the simplest way of simply using yolo models in a python environment. It To know more about using `YOLO` models, refer Model class refernce -[Model reference](#){ .md-button .md-button--primary} +[Model reference](reference/model.md){ .md-button .md-button--primary} --- ### Customizing Tasks with Trainers diff --git a/mkdocs.yml b/mkdocs.yml index 9e42752..764e08d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -51,6 +51,7 @@ markdown_extensions: - pymdownx.superfences - tables - attr_list + - def_list # Syntax highlight - pymdownx.highlight: anchor_linenums: true @@ -84,14 +85,16 @@ nav: - Detection: tasks/detection.md - Segmentation: tasks/segmentation.md - Classification: tasks/classification.md - - Customization Tutorials: + - Advanced Tutorials: - Customize Trainer: customize/train.md - Customize Validator: customize/val.md - Customize Predictor: customize/predict.md - Reference: - - YOLO Models: reference/model.md + - Python Model interface: reference/model.md - Engine: - Trainer: reference/base_trainer.md - Validator: reference/base_val.md - Predictor: reference/base_pred.md - Exporter: reference/exporter.md + - nn Module: reference/nn.md + - operations: reference/ops.md diff --git a/ultralytics/__init__.py b/ultralytics/__init__.py index 8ca0777..46aaedc 100644 --- a/ultralytics/__init__.py +++ b/ultralytics/__init__.py @@ -1,5 +1,6 @@ __version__ = "8.0.0.dev0" from ultralytics.yolo.engine.model import YOLO +from ultralytics.yolo.utils import ops __all__ = ["__version__", "YOLO", "hub"] # allow simpler import diff --git a/ultralytics/nn/autobackend.py b/ultralytics/nn/autobackend.py index d4ad519..0b50e52 100644 --- a/ultralytics/nn/autobackend.py +++ b/ultralytics/nn/autobackend.py @@ -17,22 +17,36 @@ from ultralytics.yolo.utils.ops import xywh2xyxy class AutoBackend(nn.Module): - # YOLOv5 MultiBackend class for python inference on various backends + def __init__(self, weights='yolov8n.pt', device=torch.device('cpu'), dnn=False, data=None, fp16=False, fuse=True): - # Usage: - # PyTorch: weights = *.pt - # TorchScript: *.torchscript - # ONNX Runtime: *.onnx - # ONNX OpenCV DNN: *.onnx --dnn - # OpenVINO: *.xml - # CoreML: *.mlmodel - # 
TensorRT: *.engine - # TensorFlow SavedModel: *_saved_model - # TensorFlow GraphDef: *.pb - # TensorFlow Lite: *.tflite - # TensorFlow Edge TPU: *_edgetpu.tflite - # PaddlePaddle: *_paddle_model + """ + Ultralytics YOLO MultiBackend class for python inference on various backends + + Args: + weights: the path to the weights file. Defaults to yolov8n.pt + device: The device to run the model on. + dnn: If you want to use OpenCV's DNN module to run the inference, set this to True. Defaults to + False + data: a dictionary containing the following keys: + fp16: If true, will use half precision. Defaults to False + fuse: whether to fuse the model or not. Defaults to True + Supported format and their usage: + | Platform | weights | + |-----------------------|------------------| + | PyTorch | *.pt | + | TorchScript | *.torchscript | + | ONNX Runtime | *.onnx | + | ONNX OpenCV DNN | *.onnx --dnn | + | OpenVINO | *.xml | + | CoreML | *.mlmodel | + | TensorRT | *.engine | + | TensorFlow SavedModel | *_saved_model | + | TensorFlow GraphDef | *.pb | + | TensorFlow Lite | *.tflite | + | TensorFlow Edge TPU | *_edgetpu.tflite | + | PaddlePaddle | *_paddle_model | + """ super().__init__() w = str(weights[0] if isinstance(weights, list) else weights) nn_module = isinstance(weights, torch.nn.Module) @@ -215,6 +229,15 @@ class AutoBackend(nn.Module): self.__dict__.update(locals()) # assign all variables to self def forward(self, im, augment=False, visualize=False): + """ + Runs inference on the given model + + Args: + im: the image tensor + augment: whether to augment the image. Defaults to False + visualize: if True, then the network will output the feature maps of the last convolutional layer. + Defaults to False + """ # YOLOv5 MultiBackend inference b, ch, h, w = im.shape # batch, channel, height, width if self.fp16 and im.dtype != torch.float16: @@ -297,10 +320,21 @@ class AutoBackend(nn.Module): return self.from_numpy(y) def from_numpy(self, x): + """ + `from_numpy` converts a numpy array to a tensor + + Args: + x: the numpy array to convert + """ return torch.from_numpy(x).to(self.device) if isinstance(x, np.ndarray) else x def warmup(self, imgsz=(1, 3, 640, 640)): - # Warmup model by running inference once + """ + Warmup model by running inference once + + Args: + imgsz: the size of the image you want to run inference on. + """ warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton, self.nn_module if any(warmup_types) and (self.device.type != 'cpu' or self.triton): im = torch.empty(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device) # input @@ -309,6 +343,12 @@ class AutoBackend(nn.Module): @staticmethod def _model_type(p='path/to/model.pt'): + """ + This function takes a path to a model file and returns the model type + + Args: + p: path to the model file. Defaults to path/to/model.pt + """ # Return model type from model path, i.e. path='path/to/model.onnx' -> type=onnx # types = [pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle] from ultralytics.yolo.engine.exporter import export_formats @@ -323,6 +363,12 @@ class AutoBackend(nn.Module): @staticmethod def _load_metadata(f=Path('path/to/meta.yaml')): + """ + > Loads the metadata from a yaml file + + Args: + f: The path to the metadata file. 
+ """ from ultralytics.yolo.utils.files import yaml_load # Load metadata from meta.yaml if it exists diff --git a/ultralytics/nn/tasks.py b/ultralytics/nn/tasks.py index 7edbb53..afe0bfa 100644 --- a/ultralytics/nn/tasks.py +++ b/ultralytics/nn/tasks.py @@ -17,11 +17,36 @@ from ultralytics.yolo.utils.torch_utils import (fuse_conv_and_bn, initialize_wei class BaseModel(nn.Module): - # YOLOv5 base model + ''' + The BaseModel class is a base class for all the models in the Ultralytics YOLO family. + ''' + def forward(self, x, profile=False, visualize=False): - return self._forward_once(x, profile, visualize) # single-scale inference, train + """ + > `forward` is a wrapper for `_forward_once` that runs the model on a single scale + + Args: + x: the input image + profile: whether to profile the model. Defaults to False + visualize: if True, will return the intermediate feature maps. Defaults to False + + Returns: + The output of the network. + """ + return self._forward_once(x, profile, visualize) def _forward_once(self, x, profile=False, visualize=False): + """ + > Forward pass of the network + + Args: + x: input to the model + profile: if True, the time taken for each layer will be printed. Defaults to False + visualize: If True, it will save the feature maps of the model. Defaults to False + + Returns: + The last layer of the model. + """ y, dt = [], [] # outputs for m in self.model: if m.f != -1: # if not from previous layer @@ -36,6 +61,15 @@ class BaseModel(nn.Module): return x def _profile_one_layer(self, m, x, dt): + """ + It takes a model, an input, and a list of times, and it profiles the model on the input, appending + the time to the list + + Args: + m: the model + x: the input image + dt: list of time taken for each layer + """ c = m == self.model[-1] # is final layer, copy input as inplace fix o = thop.profile(m, inputs=(x.copy() if c else x,), verbose=False)[0] / 1E9 * 2 if thop else 0 # FLOPs t = time_sync() @@ -48,7 +82,13 @@ class BaseModel(nn.Module): if c: LOGGER.info(f"{sum(dt):10.2f} {'-':>10s} {'-':>10s} Total") - def fuse(self): # fuse model Conv2d() + BatchNorm2d() layers + def fuse(self): + """ + > It takes a model and fuses the Conv2d() and BatchNorm2d() layers into a single layer + + Returns: + The model is being returned. + """ LOGGER.info('Fusing layers... ') for m in self.model.modules(): if isinstance(m, (Conv, DWConv)) and hasattr(m, 'bn'): @@ -58,11 +98,27 @@ class BaseModel(nn.Module): self.info() return self - def info(self, verbose=False, imgsz=640): # print model information + def info(self, verbose=False, imgsz=640): + """ + Prints model information + + Args: + verbose: if True, prints out the model information. Defaults to False + imgsz: the size of the image that the model will be trained on. Defaults to 640 + """ model_info(self, verbose, imgsz) def _apply(self, fn): - # Apply to(), cpu(), cuda(), half() to model tensors that are not parameters or registered buffers + """ + `_apply()` is a function that applies a function to all the tensors in the model that are not + parameters or registered buffers + + Args: + fn: the function to apply to the model + + Returns: + A model that is a Detect() object. + """ self = super()._apply(fn) m = self.model[-1] # Detect() if isinstance(m, (Detect, Segment)): @@ -72,6 +128,12 @@ class BaseModel(nn.Module): return self def load(self, weights): + """ + > This function loads the weights of the model from a file + + Args: + weights: The weights to load into the model. 
+ """ # Force all tasks to implement this function raise NotImplementedError("This function needs to be implemented by derived classes!") diff --git a/ultralytics/yolo/utils/ops.py b/ultralytics/yolo/utils/ops.py index 41b0db0..edb32b0 100644 --- a/ultralytics/yolo/utils/ops.py +++ b/ultralytics/yolo/utils/ops.py @@ -47,6 +47,17 @@ def coco80_to_coco91_class(): # converts 80-index (val2014) to 91-index (paper) def segment2box(segment, width=640, height=640): + """ + > Convert 1 segment label to 1 box label, applying inside-image constraint, i.e. (xy1, xy2, ...) to + (xyxy) + Args: + segment: the segment label + width: the width of the image. Defaults to 640 + height: The height of the image. Defaults to 640 + + Returns: + the minimum and maximum x and y values of the segment. + """ # Convert 1 segment label to 1 box label, applying inside-image constraint, i.e. (xy1, xy2, ...) to (xyxy) x, y = segment.T # segment xy inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height) @@ -55,7 +66,18 @@ def segment2box(segment, width=640, height=640): def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None): - # Rescale boxes (xyxy) from img1_shape to img0_shape + """ + > Rescale boxes (xyxy) from img1_shape to img0_shape + Args: + img1_shape: The shape of the image that the bounding boxes are for. + boxes: the bounding boxes of the objects in the image + img0_shape: the shape of the original image + ratio_pad: a tuple of (ratio, pad) + + Returns: + The boxes are being returned. + """ + # if ratio_pad is None: # calculate from img0_shape gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding @@ -70,18 +92,6 @@ def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None): return boxes -def clip_boxes(boxes, shape): - # Clip boxes (xyxy) to image shape (height, width) - if isinstance(boxes, torch.Tensor): # faster individually - boxes[..., 0].clamp_(0, shape[1]) # x1 - boxes[..., 1].clamp_(0, shape[0]) # y1 - boxes[..., 2].clamp_(0, shape[1]) # x2 - boxes[..., 3].clamp_(0, shape[0]) # y2 - else: # np.array (faster grouped) - boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2 - boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2 - - def make_divisible(x, divisor): # Returns nearest x divisible by divisor if isinstance(divisor, torch.Tensor): @@ -101,7 +111,7 @@ def non_max_suppression( nm=0, # number of masks ): """ - Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box. + > Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box. 
Arguments: prediction (torch.Tensor): A tensor of shape (batch_size, num_boxes, num_classes + 4 + num_masks) @@ -217,6 +227,25 @@ def non_max_suppression( return output +def clip_boxes(boxes, shape): + """ + > It takes a list of bounding boxes and a shape (height, width) and clips the bounding boxes to the + shape + + Args: + boxes: the bounding boxes to clip + shape: the shape of the image + """ + if isinstance(boxes, torch.Tensor): # faster individually + boxes[..., 0].clamp_(0, shape[1]) # x1 + boxes[..., 1].clamp_(0, shape[0]) # y1 + boxes[..., 2].clamp_(0, shape[1]) # x2 + boxes[..., 3].clamp_(0, shape[0]) # y2 + else: # np.array (faster grouped) + boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2 + boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2 + + def clip_coords(boxes, shape): # Clip bounding xyxy bounding boxes to image shape (height, width) if isinstance(boxes, torch.Tensor): # faster individually @@ -231,9 +260,16 @@ def clip_coords(boxes, shape): def scale_image(im1_shape, masks, im0_shape, ratio_pad=None): """ - img1_shape: model input shape, [h, w] - img0_shape: origin pic shape, [h, w, 3] - masks: [h, w, num] + > It takes a mask, and resizes it to the original image size + + Args: + im1_shape: model input shape, [h, w] + masks: [h, w, num] + im0_shape: the original image shape + ratio_pad: the ratio of the padding to the original image. + + Returns: + The masks are being returned. """ # Rescale coordinates (xyxy) from im1_shape to im0_shape if ratio_pad is None: # calculate from im0_shape @@ -258,7 +294,16 @@ def scale_image(im1_shape, masks, im0_shape, ratio_pad=None): def xyxy2xywh(x): - # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right + """ + > It takes a list of bounding boxes, and converts them from the format [x1, y1, x2, y2] to [x, y, w, + h] where xy1=top-left, xy2=bottom-right + + Args: + x: the input tensor + + Returns: + the center of the box, the width and the height of the box. + """ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) y[..., 0] = (x[..., 0] + x[..., 2]) / 2 # x center y[..., 1] = (x[..., 1] + x[..., 3]) / 2 # y center @@ -268,7 +313,15 @@ def xyxy2xywh(x): def xywh2xyxy(x): - # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right + """ + > It converts the bounding box from x,y,w,h to x1,y1,x2,y2 where xy1=top-left, xy2=bottom-right + + Args: + x: the input tensor + + Returns: + the top left and bottom right coordinates of the bounding box. + """ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) y[..., 0] = x[..., 0] - x[..., 2] / 2 # top left x y[..., 1] = x[..., 1] - x[..., 3] / 2 # top left y @@ -278,7 +331,19 @@ def xywh2xyxy(x): def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0): - # Convert nx4 boxes from [x, y, w, h] normalized to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right + """ + > It converts the normalized coordinates to the actual coordinates [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right + + Args: + x: the bounding box coordinates + w: width of the image. Defaults to 640 + h: height of the image. Defaults to 640 + padw: padding width. Defaults to 0 + padh: height of the padding. Defaults to 0 + + Returns: + the xyxy coordinates of the bounding box. 
+ """ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) y[..., 0] = w * (x[..., 0] - x[..., 2] / 2) + padw # top left x y[..., 1] = h * (x[..., 1] - x[..., 3] / 2) + padh # top left y @@ -288,7 +353,20 @@ def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0): def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0): - # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] normalized where xy1=top-left, xy2=bottom-right + """ + > It takes in a list of bounding boxes, and returns a list of bounding boxes, but with the x and y + coordinates normalized to the width and height of the image + + Args: + x: the bounding box coordinates + w: width of the image. Defaults to 640 + h: height of the image. Defaults to 640 + clip: If True, the boxes will be clipped to the image boundaries. Defaults to False + eps: the minimum value of the box's width and height. + + Returns: + the xywhn format of the bounding boxes. + """ if clip: clip_boxes(x, (h - eps, w - eps)) # warning: inplace clip y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) @@ -300,7 +378,19 @@ def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0): def xyn2xy(x, w=640, h=640, padw=0, padh=0): - # Convert normalized segments into pixel segments, shape (n,2) + """ + > It converts normalized segments into pixel segments of shape (n,2) + + Args: + x: the normalized coordinates of the bounding box + w: width of the image. Defaults to 640 + h: height of the image. Defaults to 640 + padw: padding width. Defaults to 0 + padh: padding height. Defaults to 0 + + Returns: + the x and y coordinates of the top left corner of the bounding box. + """ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) y[..., 0] = w * x[..., 0] + padw # top left x y[..., 1] = h * x[..., 1] + padh # top left y @@ -308,7 +398,15 @@ def xyn2xy(x, w=640, h=640, padw=0, padh=0): def xywh2ltwh(x): - # Convert nx4 boxes from [x, y, w, h] to [x1, y1, w, h] where xy1=top-left + """ + > It converts the bounding box from [x, y, w, h] to [x1, y1, w, h] where xy1=top-left + + Args: + x: the x coordinate of the center of the bounding box + + Returns: + the top left x and y coordinates of the bounding box. + """ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y @@ -316,7 +414,15 @@ def xywh2ltwh(x): def xyxy2ltwh(x): - # Convert nx4 boxes from [x1, y1, x2, y2] to [x1, y1, w, h] where xy1=top-left, xy2=bottom-right + """ + > Convert nx4 boxes from [x1, y1, x2, y2] to [x1, y1, w, h] where xy1=top-left, xy2=bottom-right + + Args: + x: the input tensor + + Returns: + the xyxy2ltwh function. 
+ """ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) y[:, 2] = x[:, 2] - x[:, 0] # width y[:, 3] = x[:, 3] - x[:, 1] # height @@ -324,7 +430,12 @@ def xyxy2ltwh(x): def ltwh2xywh(x): - # Convert nx4 boxes from [x1, y1, w, h] to [x, y, w, h] where xy1=top-left, xy=center + """ + > Convert nx4 boxes from [x1, y1, w, h] to [x, y, w, h] where xy1=top-left, xy=center + + Args: + x: the input tensor + """ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) y[:, 0] = x[:, 0] + x[:, 2] / 2 # center x y[:, 1] = x[:, 1] + x[:, 3] / 2 # center y @@ -332,7 +443,16 @@ def ltwh2xywh(x): def ltwh2xyxy(x): - # Convert nx4 boxes from [x1, y1, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right + """ + > It converts the bounding box from [x1, y1, w, h] to [x1, y1, x2, y2] where xy1=top-left, + xy2=bottom-right + + Args: + x: the input image + + Returns: + the xyxy coordinates of the bounding boxes. + """ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) y[:, 2] = x[:, 2] + x[:, 0] # width y[:, 3] = x[:, 3] + x[:, 1] # height @@ -340,7 +460,16 @@ def ltwh2xyxy(x): def segments2boxes(segments): - # Convert segment labels to box labels, i.e. (cls, xy1, xy2, ...) to (cls, xywh) + """ + > It converts segment labels to box labels, i.e. (cls, xy1, xy2, ...) to (cls, xywh) + + Args: + segments: list of segments, each segment is a list of points, each point is a list of x, y + coordinates + + Returns: + the xywh coordinates of the bounding boxes. + """ boxes = [] for s in segments: x, y = s.T # segment xy @@ -349,7 +478,17 @@ def segments2boxes(segments): def resample_segments(segments, n=1000): - # Up-sample an (n,2) segment + """ + > It takes a list of segments (n,2) and returns a list of segments (n,2) where each segment has been + up-sampled to n points + + Args: + segments: a list of (n,2) arrays, where n is the number of points in the segment. + n: number of points to resample the segment to. Defaults to 1000 + + Returns: + the resampled segments. + """ for i, s in enumerate(segments): s = np.concatenate((s, s[0:1, :]), axis=0) x = np.linspace(0, len(s) - 1, n) @@ -360,13 +499,15 @@ def resample_segments(segments, n=1000): def crop_mask(masks, boxes): """ - "Crop" predicted masks by zeroing out everything not in the predicted bbox. - Vectorized by Chong (thanks Chong). + > It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box + Args: - - masks should be a size [h, w, n] tensor of masks - - boxes should be a size [n, 4] tensor of bbox coords in relative point form - """ + masks: [h, w, n] tensor of masks + boxes: [n, 4] tensor of bbox coords in relative point form + Returns: + The masks are being cropped to the bounding box. + """ n, h, w = masks.shape x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) @@ -377,14 +518,18 @@ def crop_mask(masks, boxes): def process_mask_upsample(protos, masks_in, bboxes, shape): """ - Crop after upsample. - proto_out: [mask_dim, mask_h, mask_w] - out_masks: [n, mask_dim], n is number of masks after nms - bboxes: [n, 4], n is number of masks after nms - shape:input_image_size, (h, w) - return: h, w, n - """ + > It takes the output of the mask head, and applies the mask to the bounding boxes. This produces masks of higher + quality but is slower. 
+ + Args: + protos: [mask_dim, mask_h, mask_w] + masks_in: [n, mask_dim], n is number of masks after nms + bboxes: [n, 4], n is number of masks after nms + shape: the size of the input image + Returns: + mask + """ c, mh, mw = protos.shape # CHW masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw) masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0] # CHW @@ -394,12 +539,17 @@ def process_mask_upsample(protos, masks_in, bboxes, shape): def process_mask(protos, masks_in, bboxes, shape, upsample=False): """ - Crop before upsample. - proto_out: [mask_dim, mask_h, mask_w] - out_masks: [n, mask_dim], n is number of masks after nms - bboxes: [n, 4], n is number of masks after nms - shape:input_image_size, (h, w) - return: h, w, n + > It takes the output of the mask head, and applies the mask to the bounding boxes. This is faster but produces + downsampled quality of mask + + Args: + protos: [mask_dim, mask_h, mask_w] + masks_in: [n, mask_dim], n is number of masks after nms + bboxes: [n, 4], n is number of masks after nms + shape: the size of the input image + + Returns: + mask """ c, mh, mw = protos.shape # CHW @@ -420,12 +570,16 @@ def process_mask(protos, masks_in, bboxes, shape, upsample=False): def process_mask_native(protos, masks_in, bboxes, shape): """ - Crop after upsample. - protos: [mask_dim, mask_h, mask_w] - masks_in: [n, mask_dim], n is number of masks after nms - bboxes: [n, 4], n is number of masks after nms - shape: input_image_size, (h, w) - return: h, w, n + > It takes the output of the mask head, and crops it after upsampling to the bounding boxes. + + Args: + protos: [mask_dim, mask_h, mask_w] + masks_in: [n, mask_dim], n is number of masks after nms + bboxes: [n, 4], n is number of masks after nms + shape: input_image_size, (h, w) + + Returns: + masks: [h, w, n] """ c, mh, mw = protos.shape # CHW masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw) @@ -441,7 +595,19 @@ def process_mask_native(protos, masks_in, bboxes, shape): def scale_segments(img1_shape, segments, img0_shape, ratio_pad=None, normalize=False): - # Rescale coords (xyxy) from img1_shape to img0_shape + """ + > Rescale segment coords (xyxy) from img1_shape to img0_shape + + Args: + img1_shape: The shape of the image that the segments are from. + segments: the segments to be scaled + img0_shape: the shape of the image that the segmentation is being applied to + ratio_pad: the ratio of the image size to the padded image size. + normalize: If True, the coordinates will be normalized to the range [0, 1]. Defaults to False + + Returns: + the segmented image. + """ if ratio_pad is None: # calculate from img0_shape gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding @@ -460,7 +626,16 @@ def scale_segments(img1_shape, segments, img0_shape, ratio_pad=None, normalize=F def masks2segments(masks, strategy='largest'): - # Convert masks(n,160,160) into segments(n,xy) + """ + > It takes a list of masks(n,h,w) and returns a list of segments(n,xy) + + Args: + masks: the output of the model, which is a tensor of shape (batch_size, 160, 160) + strategy: 'concat' or 'largest'. 
Defaults to largest + + Returns: + segments (List): list of segment masks + """ segments = [] for x in masks.int().cpu().numpy().astype('uint8'): c = cv2.findContours(x, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0] @@ -476,7 +651,14 @@ def masks2segments(masks, strategy='largest'): def clip_segments(segments, shape): - # Clip segments (xy1,xy2,...) to image shape (height, width) + """ + > It takes a list of line segments (x1,y1,x2,y2) and clips them to the image shape (height, width) + + Args: + segments: a list of segments, each segment is a list of points, each point is a list of x,y + coordinates + shape: the shape of the image + """ if isinstance(segments, torch.Tensor): # faster individually segments[:, 0].clamp_(0, shape[1]) # x segments[:, 1].clamp_(0, shape[0]) # y
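
To round off the ops changes, a small self-contained sketch (illustrative only, not part of the diff) exercising the box-format helpers documented above through the `ultralytics.yolo.utils.ops` module exposed by the new `__init__.py` import; the box values are made up.

```python
import torch

from ultralytics.yolo.utils import ops

# Two xywh boxes (center-x, center-y, width, height); the second extends past a 640x640 image.
xywh = torch.tensor([[320.0, 320.0, 100.0, 50.0],
                     [630.0, 10.0, 40.0, 40.0]])

xyxy = ops.xywh2xyxy(xywh)                  # convert to [x1, y1, x2, y2]
ops.clip_boxes(xyxy, (640, 640))            # clip in place to the (height, width) image shape
xywhn = ops.xyxy2xywhn(xyxy, w=640, h=640)  # back to normalized xywh
print(xyxy)
print(xywhn)
```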