ultralytics 8.0.90 actions and docs improvements (#2326)

Co-authored-by: calmisential <xinyu_std@163.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: triple Mu <gpu@163.com>
Co-authored-by: Laughing <61612323+Laughing-q@users.noreply.github.com>
Co-authored-by: Ayush Chaurasia <ayush.chaurarsia@gmail.com>
Co-authored-by: Laughing-q <1185102784@qq.com>
Co-authored-by: ran xiao <ben.xiao@me.com>
Co-authored-by: rxiao <ran.xiao@silverpond.com.au>
commit 44c7c3514d (parent 243fc4b1fe)
Author: Glenn Jocher
Date: 2023-04-29 20:16:56 +02:00
committed by GitHub
39 changed files with 783 additions and 143 deletions


@@ -233,10 +233,10 @@ class PromptEncoder(nn.Module):
   embeddings.
 Arguments:
-  points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
+  points (tuple(torch.Tensor, torch.Tensor), None): point coordinates
     and labels to embed.
-  boxes (torch.Tensor or none): boxes to embed
-  masks (torch.Tensor or none): masks to embed
+  boxes (torch.Tensor, None): boxes to embed
+  masks (torch.Tensor, None): masks to embed
 Returns:
   torch.Tensor: sparse embeddings for the points and boxes, with shape
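
For context on the arguments touched above, here is a minimal usage sketch of PromptEncoder's optional prompts. It assumes the upstream segment-anything package that this module is vendored from; the constructor values are illustrative, and only the forward(points, boxes, masks) signature comes from the docstring.

# Minimal sketch (assumption: upstream segment-anything package, from which
# this PromptEncoder is vendored; constructor values are illustrative).
import torch
from segment_anything.modeling.prompt_encoder import PromptEncoder

pe = PromptEncoder(
    embed_dim=256,
    image_embedding_size=(64, 64),    # 1024-pixel input / 16-pixel patch stride
    input_image_size=(1024, 1024),
    mask_in_chans=16,
)

coords = torch.rand(1, 2, 2) * 1024   # B x N x 2 point coordinates, in pixels
labels = torch.ones(1, 2)             # 1 = foreground point, 0 = background

# Each prompt type is optional: pass None for any prompt that is absent.
sparse, dense = pe(points=(coords, labels), boxes=None, masks=None)
print(sparse.shape, dense.shape)      # sparse: B x tokens x 256, dense: B x 256 x 64 x 64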
@@ -337,7 +337,7 @@ class Block(nn.Module):
   rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
   window_size (int): Window size for window attention blocks. If it equals 0, then
     use global attention.
-  input_size (tuple(int, int) or None): Input resolution for calculating the relative
+  input_size (tuple(int, int), None): Input resolution for calculating the relative
     positional parameter size.
 """
 super().__init__()
@@ -392,9 +392,8 @@ class Attention(nn.Module):
   dim (int): Number of input channels.
   num_heads (int): Number of attention heads.
   qkv_bias (bool): If True, add a learnable bias to query, key, value.
-  rel_pos (bool): If True, add relative positional embeddings to the attention map.
   rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
-  input_size (tuple(int, int) or None): Input resolution for calculating the relative
+  input_size (tuple(int, int), None): Input resolution for calculating the relative
     positional parameter size.
 """
 super().__init__()
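
A sketch of how the relative-position flags and input_size fit together, using the upstream segment-anything image encoder that this block is vendored from (the flag is spelled use_rel_pos there, which is an assumption about the vendored copy); all values are illustrative.

# Sketch against the upstream segment-anything Attention (assumption: the
# vendored class keeps the upstream signature, where the flag is use_rel_pos).
import torch
from segment_anything.modeling.image_encoder import Attention

attn = Attention(
    dim=768,
    num_heads=12,
    qkv_bias=True,
    use_rel_pos=True,        # add relative positional embeddings to attention
    rel_pos_zero_init=True,  # rel-pos tables start zero-initialized
    input_size=(14, 14),     # required when use_rel_pos=True, e.g. window size 14
)

x = torch.rand(1, 14, 14, 768)   # B x H x W x C, as inside a windowed Block
print(attn(x).shape)             # torch.Size([1, 14, 14, 768])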


@@ -45,7 +45,7 @@ class SamAutomaticMaskGenerator:
 Arguments:
   model (Sam): The SAM model to use for mask prediction.
-  points_per_side (int or None): The number of points to be sampled
+  points_per_side (int, None): The number of points to be sampled
     along one side of the image. The total number of points is
     points_per_side**2. If None, 'point_grids' must provide explicit
     point sampling.
@@ -70,7 +70,7 @@ class SamAutomaticMaskGenerator:
     the image length. Later layers with more crops scale down this overlap.
   crop_n_points_downscale_factor (int): The number of points-per-side
     sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
-  point_grids (list(np.ndarray) or None): A list over explicit grids
+  point_grids (list(np.ndarray), None): A list over explicit grids
     of points used for sampling, normalized to [0,1]. The nth grid in the
     list is used in the nth crop layer. Exclusive with points_per_side.
   min_mask_region_area (int): If >0, postprocessing will be applied
@@ -128,9 +128,8 @@ class SamAutomaticMaskGenerator:
   image (np.ndarray): The image to generate masks for, in HWC uint8 format.
 Returns:
-  list(dict(str, any)): A list over records for masks. Each record is
-    a dict containing the following keys:
-    segmentation (dict(str, any) or np.ndarray): The mask. If
+  list(dict(str, any)): A list over records for masks. Each record is a dict containing the following keys:
+    segmentation (dict(str, any), np.ndarray): The mask. If
       output_mode='binary_mask', is an array of shape HW. Otherwise,
       is a dictionary containing the RLE.
     bbox (list(float)): The box around the mask, in XYWH format.
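
A sketch of consuming the mask records described above; model loading follows the upstream segment-anything API, and the checkpoint and image paths are placeholders.

# Sketch: iterate the generate() records documented above. Checkpoint and
# image paths are placeholders; loading follows upstream segment-anything.
import cv2
from segment_anything import SamAutomaticMaskGenerator, sam_model_registry

sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b.pth")   # placeholder
generator = SamAutomaticMaskGenerator(sam, points_per_side=32)  # int, or None with point_grids=...

image = cv2.cvtColor(cv2.imread("image.jpg"), cv2.COLOR_BGR2RGB)  # HWC uint8
for record in generator.generate(image):
    mask = record["segmentation"]   # HW bool array with output_mode='binary_mask'
    x, y, w, h = record["bbox"]     # box around the mask, XYWH
    print(mask.shape, (x, y, w, h), record["predicted_iou"])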


@@ -81,12 +81,12 @@ class PromptPredictor:
 Predict masks for the given input prompts, using the currently set image.
 Arguments:
-  point_coords (np.ndarray or None): A Nx2 array of point prompts to the
+  point_coords (np.ndarray, None): A Nx2 array of point prompts to the
    model. Each point is in (X,Y) in pixels.
-  point_labels (np.ndarray or None): A length N array of labels for the
+  point_labels (np.ndarray, None): A length N array of labels for the
    point prompts. 1 indicates a foreground point and 0 indicates a
    background point.
-  box (np.ndarray or None): A length 4 array given a box prompt to the
+  box (np.ndarray, None): A length 4 array given a box prompt to the
    model, in XYXY format.
   mask_input (np.ndarray): A low resolution mask input to the model, typically
    coming from a previous prediction iteration. Has form 1xHxW, where
@@ -158,12 +158,12 @@ class PromptPredictor:
   transformed to the input frame using ResizeLongestSide.
 Arguments:
-  point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the
+  point_coords (torch.Tensor, None): A BxNx2 array of point prompts to the
    model. Each point is in (X,Y) in pixels.
-  point_labels (torch.Tensor or None): A BxN array of labels for the
+  point_labels (torch.Tensor, None): A BxN array of labels for the
    point prompts. 1 indicates a foreground point and 0 indicates a
    background point.
-  boxes (np.ndarray or None): A Bx4 array given a box prompt to the
+  boxes (np.ndarray, None): A Bx4 array given a box prompt to the
    model, in XYXY format.
   mask_input (np.ndarray): A low resolution mask input to the model, typically
    coming from a previous prediction iteration. Has form Bx1xHxW, where
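
A sketch of the prompt combinations these optional arguments allow. PromptPredictor here mirrors the upstream SamPredictor API, which is what the example uses (an assumption about the vendored copy); checkpoint and image paths are placeholders.

# Sketch: PromptPredictor mirrors the upstream SamPredictor used below
# (assumption); checkpoint and image paths are placeholders.
import cv2
import numpy as np
from segment_anything import SamPredictor, sam_model_registry

sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b.pth")  # placeholder
predictor = SamPredictor(sam)
predictor.set_image(cv2.cvtColor(cv2.imread("image.jpg"), cv2.COLOR_BGR2RGB))

point_coords = np.array([[500, 375]])   # Nx2, (X, Y) in pixels; or None
point_labels = np.array([1])            # 1 = foreground, 0 = background
box = np.array([425, 300, 700, 875])    # length-4 XYXY box prompt; or None

masks, scores, low_res_logits = predictor.predict(
    point_coords=point_coords,
    point_labels=point_labels,
    box=box,
    multimask_output=True,              # return three candidate masks
)
print(masks.shape, scores)              # masks: 3 x H x W booleans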