Add RTDETR Trainer (#2745)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com> Co-authored-by: Laughing-q <1185102784@qq.com> Co-authored-by: Kayzwer <68285002+Kayzwer@users.noreply.github.com> Co-authored-by: Laughing <61612323+Laughing-q@users.noreply.github.com>
2023-06-17 17:16:18 +05:30
parent 03bce07848
commit a0ba8ef5f0
23 changed files with 989 additions and 314 deletions
--- a/ultralytics/nn/modules/head.py
+++ b/ultralytics/nn/modules/head.py
@ -163,111 +163,178 @@ class RTDETRDecoder(nn.Module):
            self,
            nc=80,
            ch=(512, 1024, 2048),
-            hidden_dim=256,
-            num_queries=300,
-            strides=(8, 16, 32),  # TODO
-            nl=3,
-            num_decoder_points=4,
-            nhead=8,
-            num_decoder_layers=6,
-            dim_feedforward=1024,
+            hd=256,  # hidden dim
+            nq=300,  # num queries
+            ndp=4,  # num decoder points
+            nh=8,  # num head
+            ndl=6,  # num decoder layers
+            d_ffn=1024,  # dim of feedforward
            dropout=0.,
            act=nn.ReLU(),
            eval_idx=-1,
            # training args
-            num_denoising=100,
+            nd=100,  # num denoising
            label_noise_ratio=0.5,
            box_noise_scale=1.0,
            learnt_init_query=False):
        super().__init__()
-        assert len(ch) <= nl
-        assert len(strides) == len(ch)
-        for _ in range(nl - len(strides)):
-            strides.append(strides[-1] * 2)
-
-        self.hidden_dim = hidden_dim
-        self.nhead = nhead
-        self.feat_strides = strides
-        self.nl = nl
+        self.hidden_dim = hd
+        self.nhead = nh
+        self.nl = len(ch)  # num level
        self.nc = nc
-        self.num_queries = num_queries
-        self.num_decoder_layers = num_decoder_layers
+        self.num_queries = nq
+        self.num_decoder_layers = ndl

        # backbone feature projection
-        self._build_input_proj_layer(ch)
+        self.input_proj = nn.ModuleList(nn.Sequential(nn.Conv2d(x, hd, 1, bias=False), nn.BatchNorm2d(hd)) for x in ch)
+        # NOTE: simplified version but it's not consistent with .pt weights.
+        # self.input_proj = nn.ModuleList(Conv(x, hd, act=False) for x in ch)

        # Transformer module
-        decoder_layer = DeformableTransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout, act, nl,
-                                                          num_decoder_points)
-        self.decoder = DeformableTransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx)
+        decoder_layer = DeformableTransformerDecoderLayer(hd, nh, d_ffn, dropout, act, self.nl, ndp)
+        self.decoder = DeformableTransformerDecoder(hd, decoder_layer, ndl, eval_idx)

        # denoising part
-        self.denoising_class_embed = nn.Embedding(nc, hidden_dim)
-        self.num_denoising = num_denoising
+        self.denoising_class_embed = nn.Embedding(nc, hd)
+        self.num_denoising = nd
        self.label_noise_ratio = label_noise_ratio
        self.box_noise_scale = box_noise_scale

        # decoder embedding
        self.learnt_init_query = learnt_init_query
        if learnt_init_query:
-            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
-        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)
+            self.tgt_embed = nn.Embedding(nq, hd)
+        self.query_pos_head = MLP(4, 2 * hd, hd, num_layers=2)

        # encoder head
-        self.enc_output = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim))
-        self.enc_score_head = nn.Linear(hidden_dim, nc)
-        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
+        self.enc_output = nn.Sequential(nn.Linear(hd, hd), nn.LayerNorm(hd))
+        self.enc_score_head = nn.Linear(hd, nc)
+        self.enc_bbox_head = MLP(hd, hd, 4, num_layers=3)

        # decoder head
-        self.dec_score_head = nn.ModuleList([nn.Linear(hidden_dim, nc) for _ in range(num_decoder_layers)])
-        self.dec_bbox_head = nn.ModuleList([
-            MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers)])
+        self.dec_score_head = nn.ModuleList([nn.Linear(hd, nc) for _ in range(ndl)])
+        self.dec_bbox_head = nn.ModuleList([MLP(hd, hd, 4, num_layers=3) for _ in range(ndl)])

        self._reset_parameters()

-    def forward(self, feats, gt_meta=None):
+    def forward(self, x, batch=None):
+        """
+        Run the decoder head on multi-level features.
+
+        Args:
+            x (list): Feature maps, one per entry of `self.input_proj`; each one is projected and flattened
+                by `_get_encoder_input`.
+            batch (optional): Forwarded unchanged to `get_cdn_group` to build the denoising queries; its
+                expected contents are defined by that helper (not visible here).
+
+        Returns:
+            dec_bboxes, dec_scores: Stacked decoder outputs; `dec_scores` goes through an in-place sigmoid
+                when the module is not in training mode.
+            enc_bboxes, enc_scores: Sigmoided boxes and raw scores of the selected top-k encoder proposals.
+            dn_meta: Fourth value returned by `get_cdn_group`, passed through untouched.
+        """
+        # NOTE(review): function-scope import, presumably to avoid a circular import -- confirm.
+        from ultralytics.vit.utils.ops import get_cdn_group
+
        # input projection and embedding
-        memory, spatial_shapes, _ = self._get_encoder_input(feats)
+        feats, shapes = self._get_encoder_input(x)

        # prepare denoising training
-        if self.training:
-            raise NotImplementedError
-            # denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
-            #     get_contrastive_denoising_training_group(gt_meta,
-            #                                 self.num_classes,
-            #                                 self.num_queries,
-            #                                 self.denoising_class_embed.weight,
-            #                                 self.num_denoising,
-            #                                 self.label_noise_ratio,
-            #                                 self.box_noise_scale)
-        else:
-            denoising_class, denoising_bbox_unact, attn_mask = None, None, None
+        dn_embed, dn_bbox, attn_mask, dn_meta = \
+            get_cdn_group(batch,
+                          self.nc,
+                          self.num_queries,
+                          self.denoising_class_embed.weight,
+                          self.num_denoising,
+                          self.label_noise_ratio,
+                          self.box_noise_scale,
+                          self.training)

-        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
-            self._get_decoder_input(memory, spatial_shapes, denoising_class, denoising_bbox_unact)
+        # denoising queries (when present) are prepended to the top-k encoder queries
+        embed, refer_bbox, enc_bboxes, enc_scores = \
+            self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)

        # decoder
-        out_bboxes, out_logits = self.decoder(target,
-                                              init_ref_points_unact,
-                                              memory,
-                                              spatial_shapes,
+        dec_bboxes, dec_scores = self.decoder(embed,
+                                              refer_bbox,
+                                              feats,
+                                              shapes,
                                              self.dec_bbox_head,
                                              self.dec_score_head,
                                              self.query_pos_head,
                                              attn_mask=attn_mask)
        if not self.training:
-            out_logits = out_logits.sigmoid_()
-        return out_bboxes, out_logits  # enc_topk_bboxes, enc_topk_logits, dn_meta
+            # inference only: turn the raw class logits into probabilities in place
+            dec_scores = dec_scores.sigmoid_()
+        return dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta

+    def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
+        """
+        Build logit-space anchor boxes for every cell of every feature level.
+
+        Args:
+            shapes (list): Per-level feature map sizes as (h, w) pairs.
+            grid_size (float): Base anchor width/height; doubled for each successive level.
+            dtype (torch.dtype): Dtype of the generated tensors.
+            device (str | torch.device): Device the tensors are created on.
+            eps (float): An anchor is valid only if all four values lie strictly inside (eps, 1 - eps).
+
+        Returns:
+            anchors (torch.Tensor): (1, sum(h*w), 4) anchors in inverse-sigmoid space, `inf` where invalid.
+            valid_mask (torch.Tensor): (1, sum(h*w), 1) boolean mask of valid anchors.
+        """
+        anchors = []
+        for i, (h, w) in enumerate(shapes):
+            grid_y, grid_x = torch.meshgrid(torch.arange(end=h, dtype=dtype, device=device),
+                                            torch.arange(end=w, dtype=dtype, device=device),
+                                            indexing='ij')
+            grid_xy = torch.stack([grid_x, grid_y], -1)  # (h, w, 2), last dim ordered (x, y)
+
+            # The normalizer must follow the (x, y) order of grid_xy, i.e. (w, h). Dividing by (h, w)
+            # pushes cell centres outside [0, 1] on non-square feature maps and marks them invalid.
+            valid_WH = torch.tensor([w, h], dtype=dtype, device=device)
+            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH  # (1, h, w, 2)
+            wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0 ** i)
+            anchors.append(torch.cat([grid_xy, wh], -1).view(-1, h * w, 4))  # (1, h*w, 4)
+
+        anchors = torch.cat(anchors, 1)  # (1, h*w*nl, 4)
+        valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True)  # 1, h*w*nl, 1
+        anchors = torch.log(anchors / (1 - anchors))  # inverse sigmoid
+        # masked_fill gives the same result as torch.where(mask, anchors, inf) without a scalar `other`
+        anchors = anchors.masked_fill(~valid_mask, float('inf'))
+        return anchors, valid_mask
+
+    def _get_encoder_input(self, x):
+        """
+        Project every feature level with its `input_proj` layer and flatten all levels into one sequence.
+
+        Returns the concatenated tokens as [b, sum(h*w), c] together with the per-level [h, w] sizes.
+        """
+        # per-level projection
+        projected = [self.input_proj[idx](level) for idx, level in enumerate(x)]
+
+        tokens, shapes = [], []
+        for level in projected:
+            height, width = level.shape[2:]
+            shapes.append([height, width])  # accumulates to [nl, 2]
+            tokens.append(level.flatten(2).permute(0, 2, 1))  # [b, c, h, w] -> [b, h*w, c]
+
+        # levels are joined along the token axis
+        return torch.cat(tokens, 1), shapes
+
+    def _get_decoder_input(self, feats, shapes, dn_embed=None, dn_bbox=None):
+        """
+        Select the top-scoring encoder proposals and assemble the decoder's initial queries.
+
+        Args:
+            feats (torch.Tensor): Flattened multi-level features, (bs, sum(h*w), c).
+            shapes (list): Per-level [h, w] sizes used to generate the anchors.
+            dn_embed (torch.Tensor, optional): Denoising query embeddings, prepended to the selected ones.
+            dn_bbox (torch.Tensor, optional): Denoising boxes (unsigmoided), prepended to the selected ones.
+
+        Returns:
+            embeddings (torch.Tensor): Decoder query embeddings, denoising part first when given.
+            refer_bbox (torch.Tensor): Unsigmoided reference boxes, denoising part first when given;
+                detached while training.
+            enc_bboxes (torch.Tensor): (bs, num_queries, 4) sigmoided boxes of the selected proposals.
+            enc_scores (torch.Tensor): (bs, num_queries, nc) raw class scores of the selected proposals.
+        """
+        bs = len(feats)
+        # prepare input for decoder
+        anchors, valid_mask = self._generate_anchors(shapes, dtype=feats.dtype, device=feats.device)
+        features = self.enc_output(torch.where(valid_mask, feats, 0))  # (bs, h*w, hd)
+
+        enc_outputs_scores = self.enc_score_head(features)  # (bs, h*w, nc)
+        # dynamic anchors + static content
+        enc_outputs_bboxes = self.enc_bbox_head(features) + anchors  # (bs, h*w, 4)
+
+        # query selection: flat token indices of the num_queries best-scoring proposals per image
+        topk_ind = torch.topk(enc_outputs_scores.max(-1).values, self.num_queries, dim=1).indices.view(-1)
+        # matching flat batch indices, built on topk_ind's device so the advanced indexing below does
+        # not mix a CPU index tensor with an accelerator one
+        batch_ind = torch.arange(end=bs, dtype=topk_ind.dtype, device=topk_ind.device)
+        batch_ind = batch_ind.unsqueeze(-1).repeat(1, self.num_queries).view(-1)
+
+        # Unsigmoided
+        refer_bbox = enc_outputs_bboxes[batch_ind, topk_ind].view(bs, self.num_queries, -1)
+
+        # enc_bboxes is taken before the denoising boxes are prepended and before detaching
+        enc_bboxes = refer_bbox.sigmoid()
+        if dn_bbox is not None:
+            refer_bbox = torch.cat([dn_bbox, refer_bbox], 1)
+        if self.training:
+            refer_bbox = refer_bbox.detach()
+        enc_scores = enc_outputs_scores[batch_ind, topk_ind].view(bs, self.num_queries, -1)
+
+        if self.learnt_init_query:
+            embeddings = self.tgt_embed.weight.unsqueeze(0).repeat(bs, 1, 1)
+        else:
+            embeddings = features[batch_ind, topk_ind].view(bs, self.num_queries, -1)
+            if self.training:
+                embeddings = embeddings.detach()
+        if dn_embed is not None:
+            embeddings = torch.cat([dn_embed, embeddings], 1)
+
+        return embeddings, refer_bbox, enc_bboxes, enc_scores
+
+    # TODO
    def _reset_parameters(self):
        # class and bbox head init
-        bias_cls = bias_init_with_prob(0.01)
-        linear_init_(self.enc_score_head)
+        bias_cls = bias_init_with_prob(0.01) / 80 * self.nc
+        # NOTE: the weight initialization in `linear_init_` would cause NaN when training with custom datasets.
+        # linear_init_(self.enc_score_head)
        constant_(self.enc_score_head.bias, bias_cls)
        constant_(self.enc_bbox_head.layers[-1].weight, 0.)
        constant_(self.enc_bbox_head.layers[-1].bias, 0.)
        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
-            linear_init_(cls_)
+            # linear_init_(cls_)
            constant_(cls_.bias, bias_cls)
            constant_(reg_.layers[-1].weight, 0.)
            constant_(reg_.layers[-1].bias, 0.)
@ -280,103 +347,3 @@ class RTDETRDecoder(nn.Module):
        xavier_uniform_(self.query_pos_head.layers[1].weight)
        for layer in self.input_proj:
            xavier_uniform_(layer[0].weight)
-
-    def _build_input_proj_layer(self, ch):
-        self.input_proj = nn.ModuleList()
-        for in_channels in ch:
-            self.input_proj.append(
-                nn.Sequential(nn.Conv2d(in_channels, self.hidden_dim, kernel_size=1, bias=False),
-                              nn.BatchNorm2d(self.hidden_dim)))
-        in_channels = ch[-1]
-        for _ in range(self.nl - len(ch)):
-            self.input_proj.append(
-                nn.Sequential(nn.Conv2D(in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1, bias=False),
-                              nn.BatchNorm2d(self.hidden_dim)))
-            in_channels = self.hidden_dim
-
-    def _generate_anchors(self, spatial_shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
-        anchors = []
-        for lvl, (h, w) in enumerate(spatial_shapes):
-            grid_y, grid_x = torch.meshgrid(torch.arange(end=h, dtype=torch.float32),
-                                            torch.arange(end=w, dtype=torch.float32),
-                                            indexing='ij')
-            grid_xy = torch.stack([grid_x, grid_y], -1)
-
-            valid_WH = torch.tensor([h, w]).to(torch.float32)
-            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
-            wh = torch.ones_like(grid_xy) * grid_size * (2.0 ** lvl)
-            anchors.append(torch.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))
-
-        anchors = torch.concat(anchors, 1)
-        valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True)
-        anchors = torch.log(anchors / (1 - anchors))
-        anchors = torch.where(valid_mask, anchors, torch.inf)
-        return anchors.to(device=device, dtype=dtype), valid_mask.to(device=device)
-
-    def _get_encoder_input(self, feats):
-        # get projection features
-        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
-        if self.nl > len(proj_feats):
-            len_srcs = len(proj_feats)
-            for i in range(len_srcs, self.nl):
-                if i == len_srcs:
-                    proj_feats.append(self.input_proj[i](feats[-1]))
-                else:
-                    proj_feats.append(self.input_proj[i](proj_feats[-1]))
-
-        # get encoder inputs
-        feat_flatten = []
-        spatial_shapes = []
-        level_start_index = [0]
-        for feat in proj_feats:
-            _, _, h, w = feat.shape
-            # [b, c, h, w] -> [b, h*w, c]
-            feat_flatten.append(feat.flatten(2).permute(0, 2, 1))
-            # [nl, 2]
-            spatial_shapes.append([h, w])
-            # [l], start index of each level
-            level_start_index.append(h * w + level_start_index[-1])
-
-        # [b, l, c]
-        feat_flatten = torch.concat(feat_flatten, 1)
-        level_start_index.pop()
-        return feat_flatten, spatial_shapes, level_start_index
-
-    def _get_decoder_input(self, memory, spatial_shapes, denoising_class=None, denoising_bbox_unact=None):
-        bs, _, _ = memory.shape
-        # prepare input for decoder
-        anchors, valid_mask = self._generate_anchors(spatial_shapes, dtype=memory.dtype, device=memory.device)
-        memory = torch.where(valid_mask, memory, 0)
-        output_memory = self.enc_output(memory)
-
-        enc_outputs_class = self.enc_score_head(output_memory)  # (bs, h*w, nc)
-        enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors  # (bs, h*w, 4)
-
-        # (bs, topk)
-        _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.num_queries, dim=1)
-        # extract region proposal boxes
-        # (bs, topk_ind)
-        batch_ind = torch.arange(end=bs, dtype=topk_ind.dtype).unsqueeze(-1).repeat(1, self.num_queries).view(-1)
-        topk_ind = topk_ind.view(-1)
-
-        # Unsigmoided
-        reference_points_unact = enc_outputs_coord_unact[batch_ind, topk_ind].view(bs, self.num_queries, -1)
-
-        enc_topk_bboxes = torch.sigmoid(reference_points_unact)
-        if denoising_bbox_unact is not None:
-            reference_points_unact = torch.concat([denoising_bbox_unact, reference_points_unact], 1)
-        if self.training:
-            reference_points_unact = reference_points_unact.detach()
-        enc_topk_logits = enc_outputs_class[batch_ind, topk_ind].view(bs, self.num_queries, -1)
-
-        # extract region features
-        if self.learnt_init_query:
-            target = self.tgt_embed.weight.unsqueeze(0).repeat(bs, 1, 1)
-        else:
-            target = output_memory[batch_ind, topk_ind].view(bs, self.num_queries, -1)
-            if self.training:
-                target = target.detach()
-        if denoising_class is not None:
-            target = torch.concat([denoising_class, target], 1)
-
-        return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits
--- a/ultralytics/nn/modules/transformer.py
+++ b/ultralytics/nn/modules/transformer.py
@ -229,23 +229,23 @@ class MSDeformAttn(nn.Module):
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)

-    def forward(self, query, reference_points, value, value_spatial_shapes, value_mask=None):
+    def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
        """
        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
        Args:
-            query (Tensor): [bs, query_length, C]
-            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
+            query (torch.Tensor): [bs, query_length, C]
+            refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area
-            value (Tensor): [bs, value_length, C]
-            value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+            value (torch.Tensor): [bs, value_length, C]
+            value_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements

        Returns:
            output (Tensor): [bs, Length_{query}, C]
        """
        bs, len_q = query.shape[:2]
-        _, len_v = value.shape[:2]
-        assert sum(s[0] * s[1] for s in value_spatial_shapes) == len_v
+        len_v = value.shape[1]
+        assert sum(s[0] * s[1] for s in value_shapes) == len_v

        value = self.value_proj(value)
        if value_mask is not None:
@ -255,18 +255,17 @@ class MSDeformAttn(nn.Module):
        attention_weights = self.attention_weights(query).view(bs, len_q, self.n_heads, self.n_levels * self.n_points)
        attention_weights = F.softmax(attention_weights, -1).view(bs, len_q, self.n_heads, self.n_levels, self.n_points)
        # N, Len_q, n_heads, n_levels, n_points, 2
-        n = reference_points.shape[-1]
-        if n == 2:
-            offset_normalizer = torch.as_tensor(value_spatial_shapes, dtype=query.dtype, device=query.device).flip(-1)
+        num_points = refer_bbox.shape[-1]
+        if num_points == 2:
+            offset_normalizer = torch.as_tensor(value_shapes, dtype=query.dtype, device=query.device).flip(-1)
            add = sampling_offsets / offset_normalizer[None, None, None, :, None, :]
-            sampling_locations = reference_points[:, :, None, :, None, :] + add
-
-        elif n == 4:
-            add = sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
-            sampling_locations = reference_points[:, :, None, :, None, :2] + add
+            sampling_locations = refer_bbox[:, :, None, :, None, :] + add
+        elif num_points == 4:
+            add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
+            sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
        else:
-            raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {n}.')
-        output = multi_scale_deformable_attn_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights)
+            raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {num_points}.')
+        output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
        output = self.output_proj(output)
        return output

@ -308,33 +307,24 @@ class DeformableTransformerDecoderLayer(nn.Module):
        tgt = self.norm3(tgt)
        return tgt

-    def forward(self,
-                tgt,
-                reference_points,
-                src,
-                src_spatial_shapes,
-                src_padding_mask=None,
-                attn_mask=None,
-                query_pos=None):
+    def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
+        """
+        Apply one decoder layer: self-attention, deformable cross-attention, then the feed-forward block.
+
+        Args:
+            embed: Query embeddings; updated by each sub-block through residual add, dropout and norm.
+            refer_bbox: Reference boxes for the queries; a level axis is inserted at dim 2 before they
+                are handed to `self.cross_attn`.
+            feats: Values attended to by the cross-attention.
+            shapes: Per-level sizes of `feats`, forwarded to the cross-attention.
+            padding_mask: Optional mask forwarded to the cross-attention.
+            attn_mask: Optional mask forwarded to the self-attention.
+            query_pos: Positional term combined with `embed` by `self.with_pos_embed`
+                (presumably added -- that helper is not visible here).
+
+        Returns:
+            The updated query embeddings.
+        """
        # self attention
-        q = k = self.with_pos_embed(tgt, query_pos)
-        if attn_mask is not None:
-            attn_mask = torch.where(attn_mask.astype('bool'), torch.zeros(attn_mask.shape, tgt.dtype),
-                                    torch.full(attn_mask.shape, float('-inf'), tgt.dtype))
-        tgt2 = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1))[0].transpose(0, 1)
-        tgt = tgt + self.dropout1(tgt2)
-        tgt = self.norm1(tgt)
+        q = k = self.with_pos_embed(embed, query_pos)
+        # first two dims are swapped for the attention call and swapped back on its first output
+        tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1),
+                             attn_mask=attn_mask)[0].transpose(0, 1)
+        embed = embed + self.dropout1(tgt)
+        embed = self.norm1(embed)

        # cross attention
-        tgt2 = self.cross_attn(self.with_pos_embed(tgt, query_pos), reference_points, src, src_spatial_shapes,
-                               src_padding_mask)
-        tgt = tgt + self.dropout2(tgt2)
-        tgt = self.norm2(tgt)
+        tgt = self.cross_attn(self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes,
+                              padding_mask)
+        embed = embed + self.dropout2(tgt)
+        embed = self.norm2(embed)

        # ffn
-        tgt = self.forward_ffn(tgt)
+        embed = self.forward_ffn(embed)

-        return tgt
+        return embed


 class DeformableTransformerDecoder(nn.Module):
@ -349,41 +339,40 @@ class DeformableTransformerDecoder(nn.Module):
        self.hidden_dim = hidden_dim
        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx

-    def forward(self,
-                tgt,
-                reference_points,
-                src,
-                src_spatial_shapes,
-                bbox_head,
-                score_head,
-                query_pos_head,
-                attn_mask=None,
-                src_padding_mask=None):
-        output = tgt
-        dec_out_bboxes = []
-        dec_out_logits = []
-        ref_points = None
-        ref_points_detach = torch.sigmoid(reference_points)
+    def forward(
+            self,
+            embed,  # decoder embeddings
+            refer_bbox,  # anchor
+            feats,  # image features
+            shapes,  # feature shapes
+            bbox_head,
+            score_head,
+            pos_mlp,
+            attn_mask=None,
+            padding_mask=None):
+        """
+        Run the decoder layers, refining the reference boxes after every layer.
+
+        `refer_bbox` arrives unsigmoided and is squashed once on entry. `bbox_head` and `score_head` are
+        indexed by layer, and `pos_mlp` turns the current reference boxes into the `query_pos` input of
+        each layer. In training mode the scores and boxes of every layer are collected; otherwise only
+        the layer at `self.eval_idx` is collected and the loop stops there.
+
+        Returns:
+            Two stacked tensors: the collected boxes and the collected class outputs.
+        """
+        output = embed
+        dec_bboxes = []
+        dec_cls = []
+        last_refined_bbox = None
+        refer_bbox = refer_bbox.sigmoid()
        for i, layer in enumerate(self.layers):
-            ref_points_input = ref_points_detach.unsqueeze(2)
-            query_pos_embed = query_pos_head(ref_points_detach)
-            output = layer(output, ref_points_input, src, src_spatial_shapes, src_padding_mask, attn_mask,
-                           query_pos_embed)
+            output = layer(output, refer_bbox, feats, shapes, padding_mask, attn_mask, pos_mlp(refer_bbox))

-            inter_ref_bbox = torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points_detach))
+            # refine bboxes, (bs, num_queries+num_denoising, 4)
+            refined_bbox = torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(refer_bbox))

            if self.training:
-                dec_out_logits.append(score_head[i](output))
+                dec_cls.append(score_head[i](output))
                if i == 0:
-                    dec_out_bboxes.append(inter_ref_bbox)
+                    dec_bboxes.append(refined_bbox)
                else:
-                    dec_out_bboxes.append(torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points)))
+                    # later layers refine from the previous layer's non-detached box
+                    dec_bboxes.append(torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(last_refined_bbox)))
            elif i == self.eval_idx:
-                dec_out_logits.append(score_head[i](output))
-                dec_out_bboxes.append(inter_ref_bbox)
+                dec_cls.append(score_head[i](output))
+                dec_bboxes.append(refined_bbox)
                break

-            ref_points = inter_ref_bbox
-            ref_points_detach = inter_ref_bbox.detach() if self.training else inter_ref_bbox
+            # last_refined_bbox keeps its graph; the reference fed to the next layer is detached in training
+            last_refined_bbox = refined_bbox
+            refer_bbox = refined_bbox.detach() if self.training else refined_bbox

-        return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits)
+        return torch.stack(dec_bboxes), torch.stack(dec_cls)