diff --git a/ultralytics/models/v6/yolov6.yaml b/ultralytics/models/v6/yolov6.yaml index a26a3df..cb5e32a 100644 --- a/ultralytics/models/v6/yolov6.yaml +++ b/ultralytics/models/v6/yolov6.yaml @@ -2,8 +2,8 @@ # YOLOv6 object detection model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/models/yolov6 # Parameters -act: nn.ReLU() nc: 80 # number of classes +activation: nn.ReLU() # (optional) model default activation function scales: # model compound scaling constants, i.e. 'model=yolov6n.yaml' will call yolov8.yaml with scale 'n' # [depth, width, max_channels] n: [0.33, 0.25, 1024] diff --git a/ultralytics/nn/tasks.py b/ultralytics/nn/tasks.py index bfe226e..38fd3fc 100644 --- a/ultralytics/nn/tasks.py +++ b/ultralytics/nn/tasks.py @@ -422,9 +422,7 @@ class RTDETRDetectionModel(DetectionModel): # NOTE: preprocess gt_bbox and gt_labels to list. bs = len(img) batch_idx = batch['batch_idx'] - gt_groups = [] - for i in range(bs): - gt_groups.append((batch_idx == i).sum().item()) + gt_groups = [(batch_idx == i).sum().item() for i in range(bs)] targets = { 'cls': batch['cls'].to(img.device, dtype=torch.long).view(-1), 'bboxes': batch['bboxes'].to(device=img.device), @@ -606,7 +604,7 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3) # Args max_channels = float('inf') - nc, act, scales = (d.get(x) for x in ('nc', 'act', 'scales')) + nc, act, scales = (d.get(x) for x in ('nc', 'activation', 'scales')) depth, width, kpt_shape = (d.get(x, 1.0) for x in ('depth_multiple', 'width_multiple', 'kpt_shape')) if scales: scale = d.get('scale') diff --git a/ultralytics/vit/sam/modules/decoders.py b/ultralytics/vit/sam/modules/decoders.py index 43a2932..cadc0f0 100644 --- a/ultralytics/vit/sam/modules/decoders.py +++ b/ultralytics/vit/sam/modules/decoders.py @@ -22,20 +22,15 @@ class MaskDecoder(nn.Module): iou_head_hidden_dim: int = 256, ) -> None: """ - Predicts masks given an image and prompt embeddings, using a - transformer 
architecture. + Predicts masks given an image and prompt embeddings, using a transformer architecture. Arguments: - transformer_dim (int): the channel dimension of the transformer - transformer (nn.Module): the transformer used to predict masks - num_multimask_outputs (int): the number of masks to predict - when disambiguating masks - activation (nn.Module): the type of activation to use when - upscaling masks - iou_head_depth (int): the depth of the MLP used to predict - mask quality - iou_head_hidden_dim (int): the hidden dimension of the MLP - used to predict mask quality + transformer_dim (int): the channel dimension of the transformer module + transformer (nn.Module): the transformer used to predict masks + num_multimask_outputs (int): the number of masks to predict when disambiguating masks + activation (nn.Module): the type of activation to use when upscaling masks + iou_head_depth (int): the depth of the MLP used to predict mask quality + iou_head_hidden_dim (int): the hidden dimension of the MLP used to predict mask quality """ super().__init__() self.transformer_dim = transformer_dim @@ -71,16 +66,15 @@ class MaskDecoder(nn.Module): Predict masks given image and prompt embeddings. Arguments: - image_embeddings (torch.Tensor): the embeddings from the image encoder - image_pe (torch.Tensor): positional encoding with the shape of image_embeddings - sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes - dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs - multimask_output (bool): Whether to return multiple masks or a single - mask. 
+ image_embeddings (torch.Tensor): the embeddings from the image encoder + image_pe (torch.Tensor): positional encoding with the shape of image_embeddings + sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes + dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs + multimask_output (bool): Whether to return multiple masks or a single mask. Returns: - torch.Tensor: batched predicted masks - torch.Tensor: batched predictions of mask quality + torch.Tensor: batched predicted masks + torch.Tensor: batched predictions of mask quality """ masks, iou_pred = self.predict_masks( image_embeddings=image_embeddings, @@ -136,9 +130,11 @@ class MaskDecoder(nn.Module): return masks, iou_pred -# Lightly adapted from -# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa class MLP(nn.Module): + """ + Lightly adapted from + https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py + """ def __init__( self, diff --git a/ultralytics/vit/utils/ops.py b/ultralytics/vit/utils/ops.py index 164c874..6585987 100644 --- a/ultralytics/vit/utils/ops.py +++ b/ultralytics/vit/utils/ops.py @@ -249,7 +249,7 @@ def get_cdn_group(batch, attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), max_nums * 2 * (i + 1):num_dn] = True attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), :max_nums * 2 * i] = True dn_meta = { - 'dn_pos_idx': [p.reshape(-1) for p in pos_idx.cpu().split([n for n in gt_groups], dim=1)], + 'dn_pos_idx': [p.reshape(-1) for p in pos_idx.cpu().split(list(gt_groups), dim=1)], 'dn_num_group': num_group, 'dn_num_split': [num_dn, num_queries]} @@ -258,5 +258,6 @@ def get_cdn_group(batch, def inverse_sigmoid(x, eps=1e-6): + """Inverse sigmoid function.""" x = x.clip(min=0., max=1.) 
return torch.log(x / (1 - x + eps) + eps) diff --git a/ultralytics/yolo/cfg/default.yaml b/ultralytics/yolo/cfg/default.yaml index 35be2e9..24c9b17 100644 --- a/ultralytics/yolo/cfg/default.yaml +++ b/ultralytics/yolo/cfg/default.yaml @@ -1,117 +1,117 @@ # Ultralytics YOLO 🚀, AGPL-3.0 license # Default training settings and hyperparameters for medium-augmentation COCO training -task: detect # YOLO task, i.e. detect, segment, classify, pose -mode: train # YOLO mode, i.e. train, val, predict, export, track, benchmark +task: detect # (str) YOLO task, i.e. detect, segment, classify, pose +mode: train # (str) YOLO mode, i.e. train, val, predict, export, track, benchmark # Train settings ------------------------------------------------------------------------------------------------------- -model: # path to model file, i.e. yolov8n.pt, yolov8n.yaml -data: # path to data file, i.e. coco128.yaml -epochs: 100 # number of epochs to train for -patience: 50 # epochs to wait for no observable improvement for early stopping of training -batch: 16 # number of images per batch (-1 for AutoBatch) -imgsz: 640 # size of input images as integer or w,h -save: True # save train checkpoints and predict results -save_period: -1 # Save checkpoint every x epochs (disabled if < 1) -cache: False # True/ram, disk or False. Use cache for data loading -device: # device to run on, i.e. 
cuda device=0 or device=0,1,2,3 or device=cpu -workers: 8 # number of worker threads for data loading (per RANK if DDP) -project: # project name -name: # experiment name, results saved to 'project/name' directory -exist_ok: False # whether to overwrite existing experiment -pretrained: False # whether to use a pretrained model -optimizer: auto # optimizer to use, choices=[SGD, Adam, Adamax, AdamW, NAdam, RAdam, RMSProp, auto] -verbose: True # whether to print verbose output -seed: 0 # random seed for reproducibility -deterministic: True # whether to enable deterministic mode -single_cls: False # train multi-class data as single-class -rect: False # rectangular training if mode='train' or rectangular validation if mode='val' -cos_lr: False # use cosine learning rate scheduler +model: # (str, optional) path to model file, i.e. yolov8n.pt, yolov8n.yaml +data: # (str, optional) path to data file, i.e. coco128.yaml +epochs: 100 # (int) number of epochs to train for +patience: 50 # (int) epochs to wait for no observable improvement for early stopping of training +batch: 16 # (int) number of images per batch (-1 for AutoBatch) +imgsz: 640 # (int) size of input images as integer or w,h +save: True # (bool) save train checkpoints and predict results +save_period: -1 # (int) Save checkpoint every x epochs (disabled if < 1) +cache: False # (bool) True/ram, disk or False. Use cache for data loading +device: # (int | str | list, optional) device to run on, i.e. 
cuda device=0 or device=0,1,2,3 or device=cpu +workers: 8 # (int) number of worker threads for data loading (per RANK if DDP) +project: # (str, optional) project name +name: # (str, optional) experiment name, results saved to 'project/name' directory +exist_ok: False # (bool) whether to overwrite existing experiment +pretrained: False # (bool) whether to use a pretrained model +optimizer: auto # (str) optimizer to use, choices=[SGD, Adam, Adamax, AdamW, NAdam, RAdam, RMSProp, auto] +verbose: True # (bool) whether to print verbose output +seed: 0 # (int) random seed for reproducibility +deterministic: True # (bool) whether to enable deterministic mode +single_cls: False # (bool) train multi-class data as single-class +rect: False # (bool) rectangular training if mode='train' or rectangular validation if mode='val' +cos_lr: False # (bool) use cosine learning rate scheduler close_mosaic: 0 # (int) disable mosaic augmentation for final epochs -resume: False # resume training from last checkpoint -amp: True # Automatic Mixed Precision (AMP) training, choices=[True, False], True runs AMP check -fraction: 1.0 # dataset fraction to train on (default is 1.0, all images in train set) -profile: False # profile ONNX and TensorRT speeds during training for loggers +resume: False # (bool) resume training from last checkpoint +amp: True # (bool) Automatic Mixed Precision (AMP) training, choices=[True, False], True runs AMP check +fraction: 1.0 # (float) dataset fraction to train on (default is 1.0, all images in train set) +profile: False # (bool) profile ONNX and TensorRT speeds during training for loggers # Segmentation -overlap_mask: True # masks should overlap during training (segment train only) -mask_ratio: 4 # mask downsample ratio (segment train only) +overlap_mask: True # (bool) masks should overlap during training (segment train only) +mask_ratio: 4 # (int) mask downsample ratio (segment train only) # Classification -dropout: 0.0 # use dropout regularization (classify 
train only) +dropout: 0.0 # (float) use dropout regularization (classify train only) # Val/Test settings ---------------------------------------------------------------------------------------------------- -val: True # validate/test during training -split: val # dataset split to use for validation, i.e. 'val', 'test' or 'train' -save_json: False # save results to JSON file -save_hybrid: False # save hybrid version of labels (labels + additional predictions) -conf: # object confidence threshold for detection (default 0.25 predict, 0.001 val) -iou: 0.7 # intersection over union (IoU) threshold for NMS -max_det: 300 # maximum number of detections per image -half: False # use half precision (FP16) -dnn: False # use OpenCV DNN for ONNX inference -plots: True # save plots during train/val +val: True # (bool) validate/test during training +split: val # (str) dataset split to use for validation, i.e. 'val', 'test' or 'train' +save_json: False # (bool) save results to JSON file +save_hybrid: False # (bool) save hybrid version of labels (labels + additional predictions) +conf: # (float, optional) object confidence threshold for detection (default 0.25 predict, 0.001 val) +iou: 0.7 # (float) intersection over union (IoU) threshold for NMS +max_det: 300 # (int) maximum number of detections per image +half: False # (bool) use half precision (FP16) +dnn: False # (bool) use OpenCV DNN for ONNX inference +plots: True # (bool) save plots during train/val # Prediction settings -------------------------------------------------------------------------------------------------- -source: # source directory for images or videos -show: False # show results if possible -save_txt: False # save results as .txt file -save_conf: False # save results with confidence scores -save_crop: False # save cropped images with results -show_labels: True # show object labels in plots -show_conf: True # show object confidence scores in plots -vid_stride: 1 # video frame-rate stride -line_width: # line width 
of the bounding boxes -visualize: False # visualize model features -augment: False # apply image augmentation to prediction sources -agnostic_nms: False # class-agnostic NMS -classes: # filter results by class, i.e. class=0, or class=[0,2,3] -retina_masks: False # use high-resolution segmentation masks -boxes: True # Show boxes in segmentation predictions +source: # (str, optional) source directory for images or videos +show: False # (bool) show results if possible +save_txt: False # (bool) save results as .txt file +save_conf: False # (bool) save results with confidence scores +save_crop: False # (bool) save cropped images with results +show_labels: True # (bool) show object labels in plots +show_conf: True # (bool) show object confidence scores in plots +vid_stride: 1 # (int) video frame-rate stride +line_width: # (int, optional) line width of the bounding boxes, auto if missing +visualize: False # (bool) visualize model features +augment: False # (bool) apply image augmentation to prediction sources +agnostic_nms: False # (bool) class-agnostic NMS +classes: # (int | list[int], optional) filter results by class, i.e. 
class=0, or class=[0,2,3] +retina_masks: False # (bool) use high-resolution segmentation masks +boxes: True # (bool) Show boxes in segmentation predictions # Export settings ------------------------------------------------------------------------------------------------------ -format: torchscript # format to export to -keras: False # use Keras -optimize: False # TorchScript: optimize for mobile -int8: False # CoreML/TF INT8 quantization -dynamic: False # ONNX/TF/TensorRT: dynamic axes -simplify: False # ONNX: simplify model -opset: # ONNX: opset version (optional) -workspace: 4 # TensorRT: workspace size (GB) -nms: False # CoreML: add NMS +format: torchscript # (str) format to export to, choices at https://docs.ultralytics.com/modes/export/#export-formats +keras: False # (bool) use Keras +optimize: False # (bool) TorchScript: optimize for mobile +int8: False # (bool) CoreML/TF INT8 quantization +dynamic: False # (bool) ONNX/TF/TensorRT: dynamic axes +simplify: False # (bool) ONNX: simplify model +opset: # (int, optional) ONNX: opset version +workspace: 4 # (int) TensorRT: workspace size (GB) +nms: False # (bool) CoreML: add NMS # Hyperparameters ------------------------------------------------------------------------------------------------------ -lr0: 0.01 # initial learning rate (i.e. 
SGD=1E-2, Adam=1E-3) -lrf: 0.01 # final learning rate (lr0 * lrf) -momentum: 0.937 # SGD momentum/Adam beta1 -weight_decay: 0.0005 # optimizer weight decay 5e-4 -warmup_epochs: 3.0 # warmup epochs (fractions ok) -warmup_momentum: 0.8 # warmup initial momentum -warmup_bias_lr: 0.1 # warmup initial bias lr -box: 7.5 # box loss gain -cls: 0.5 # cls loss gain (scale with pixels) -dfl: 1.5 # dfl loss gain -pose: 12.0 # pose loss gain -kobj: 1.0 # keypoint obj loss gain -label_smoothing: 0.0 # label smoothing (fraction) -nbs: 64 # nominal batch size -hsv_h: 0.015 # image HSV-Hue augmentation (fraction) -hsv_s: 0.7 # image HSV-Saturation augmentation (fraction) -hsv_v: 0.4 # image HSV-Value augmentation (fraction) -degrees: 0.0 # image rotation (+/- deg) -translate: 0.1 # image translation (+/- fraction) -scale: 0.5 # image scale (+/- gain) -shear: 0.0 # image shear (+/- deg) -perspective: 0.0 # image perspective (+/- fraction), range 0-0.001 -flipud: 0.0 # image flip up-down (probability) -fliplr: 0.5 # image flip left-right (probability) -mosaic: 1.0 # image mosaic (probability) -mixup: 0.0 # image mixup (probability) -copy_paste: 0.0 # segment copy-paste (probability) +lr0: 0.01 # (float) initial learning rate (i.e. 
SGD=1E-2, Adam=1E-3) +lrf: 0.01 # (float) final learning rate (lr0 * lrf) +momentum: 0.937 # (float) SGD momentum/Adam beta1 +weight_decay: 0.0005 # (float) optimizer weight decay 5e-4 +warmup_epochs: 3.0 # (float) warmup epochs (fractions ok) +warmup_momentum: 0.8 # (float) warmup initial momentum +warmup_bias_lr: 0.1 # (float) warmup initial bias lr +box: 7.5 # (float) box loss gain +cls: 0.5 # (float) cls loss gain (scale with pixels) +dfl: 1.5 # (float) dfl loss gain +pose: 12.0 # (float) pose loss gain +kobj: 1.0 # (float) keypoint obj loss gain +label_smoothing: 0.0 # (float) label smoothing (fraction) +nbs: 64 # (int) nominal batch size +hsv_h: 0.015 # (float) image HSV-Hue augmentation (fraction) +hsv_s: 0.7 # (float) image HSV-Saturation augmentation (fraction) +hsv_v: 0.4 # (float) image HSV-Value augmentation (fraction) +degrees: 0.0 # (float) image rotation (+/- deg) +translate: 0.1 # (float) image translation (+/- fraction) +scale: 0.5 # (float) image scale (+/- gain) +shear: 0.0 # (float) image shear (+/- deg) +perspective: 0.0 # (float) image perspective (+/- fraction), range 0-0.001 +flipud: 0.0 # (float) image flip up-down (probability) +fliplr: 0.5 # (float) image flip left-right (probability) +mosaic: 1.0 # (float) image mosaic (probability) +mixup: 0.0 # (float) image mixup (probability) +copy_paste: 0.0 # (float) segment copy-paste (probability) # Custom config.yaml --------------------------------------------------------------------------------------------------- -cfg: # for overriding defaults.yaml +cfg: # (str, optional) for overriding defaults.yaml # Debug, do not modify ------------------------------------------------------------------------------------------------- -v5loader: False # use legacy YOLOv5 dataloader +v5loader: False # (bool) use legacy YOLOv5 dataloader (deprecated) # Tracker settings ------------------------------------------------------------------------------------------------------ -tracker: botsort.yaml # tracker type, 
['botsort.yaml', 'bytetrack.yaml'] +tracker: botsort.yaml # (str) tracker type, choices=[botsort.yaml, bytetrack.yaml]