Add RTDETR Trainer (#2745)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com> Co-authored-by: Laughing-q <1185102784@qq.com> Co-authored-by: Kayzwer <68285002+Kayzwer@users.noreply.github.com> Co-authored-by: Laughing <61612323+Laughing-q@users.noreply.github.com>
This commit is contained in:
@ -163,111 +163,178 @@ class RTDETRDecoder(nn.Module):
|
||||
self,
|
||||
nc=80,
|
||||
ch=(512, 1024, 2048),
|
||||
hidden_dim=256,
|
||||
num_queries=300,
|
||||
strides=(8, 16, 32), # TODO
|
||||
nl=3,
|
||||
num_decoder_points=4,
|
||||
nhead=8,
|
||||
num_decoder_layers=6,
|
||||
dim_feedforward=1024,
|
||||
hd=256, # hidden dim
|
||||
nq=300, # num queries
|
||||
ndp=4, # num decoder points
|
||||
nh=8, # num head
|
||||
ndl=6, # num decoder layers
|
||||
d_ffn=1024, # dim of feedforward
|
||||
dropout=0.,
|
||||
act=nn.ReLU(),
|
||||
eval_idx=-1,
|
||||
# training args
|
||||
num_denoising=100,
|
||||
nd=100, # num denoising
|
||||
label_noise_ratio=0.5,
|
||||
box_noise_scale=1.0,
|
||||
learnt_init_query=False):
|
||||
super().__init__()
|
||||
assert len(ch) <= nl
|
||||
assert len(strides) == len(ch)
|
||||
for _ in range(nl - len(strides)):
|
||||
strides.append(strides[-1] * 2)
|
||||
|
||||
self.hidden_dim = hidden_dim
|
||||
self.nhead = nhead
|
||||
self.feat_strides = strides
|
||||
self.nl = nl
|
||||
self.hidden_dim = hd
|
||||
self.nhead = nh
|
||||
self.nl = len(ch) # num level
|
||||
self.nc = nc
|
||||
self.num_queries = num_queries
|
||||
self.num_decoder_layers = num_decoder_layers
|
||||
self.num_queries = nq
|
||||
self.num_decoder_layers = ndl
|
||||
|
||||
# backbone feature projection
|
||||
self._build_input_proj_layer(ch)
|
||||
self.input_proj = nn.ModuleList(nn.Sequential(nn.Conv2d(x, hd, 1, bias=False), nn.BatchNorm2d(hd)) for x in ch)
|
||||
# NOTE: simplified version but it's not consistent with .pt weights.
|
||||
# self.input_proj = nn.ModuleList(Conv(x, hd, act=False) for x in ch)
|
||||
|
||||
# Transformer module
|
||||
decoder_layer = DeformableTransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout, act, nl,
|
||||
num_decoder_points)
|
||||
self.decoder = DeformableTransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx)
|
||||
decoder_layer = DeformableTransformerDecoderLayer(hd, nh, d_ffn, dropout, act, self.nl, ndp)
|
||||
self.decoder = DeformableTransformerDecoder(hd, decoder_layer, ndl, eval_idx)
|
||||
|
||||
# denoising part
|
||||
self.denoising_class_embed = nn.Embedding(nc, hidden_dim)
|
||||
self.num_denoising = num_denoising
|
||||
self.denoising_class_embed = nn.Embedding(nc, hd)
|
||||
self.num_denoising = nd
|
||||
self.label_noise_ratio = label_noise_ratio
|
||||
self.box_noise_scale = box_noise_scale
|
||||
|
||||
# decoder embedding
|
||||
self.learnt_init_query = learnt_init_query
|
||||
if learnt_init_query:
|
||||
self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
|
||||
self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)
|
||||
self.tgt_embed = nn.Embedding(nq, hd)
|
||||
self.query_pos_head = MLP(4, 2 * hd, hd, num_layers=2)
|
||||
|
||||
# encoder head
|
||||
self.enc_output = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim))
|
||||
self.enc_score_head = nn.Linear(hidden_dim, nc)
|
||||
self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
|
||||
self.enc_output = nn.Sequential(nn.Linear(hd, hd), nn.LayerNorm(hd))
|
||||
self.enc_score_head = nn.Linear(hd, nc)
|
||||
self.enc_bbox_head = MLP(hd, hd, 4, num_layers=3)
|
||||
|
||||
# decoder head
|
||||
self.dec_score_head = nn.ModuleList([nn.Linear(hidden_dim, nc) for _ in range(num_decoder_layers)])
|
||||
self.dec_bbox_head = nn.ModuleList([
|
||||
MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers)])
|
||||
self.dec_score_head = nn.ModuleList([nn.Linear(hd, nc) for _ in range(ndl)])
|
||||
self.dec_bbox_head = nn.ModuleList([MLP(hd, hd, 4, num_layers=3) for _ in range(ndl)])
|
||||
|
||||
self._reset_parameters()
|
||||
|
||||
def forward(self, feats, gt_meta=None):
|
||||
def forward(self, x, batch=None):
|
||||
from ultralytics.vit.utils.ops import get_cdn_group
|
||||
|
||||
# input projection and embedding
|
||||
memory, spatial_shapes, _ = self._get_encoder_input(feats)
|
||||
feats, shapes = self._get_encoder_input(x)
|
||||
|
||||
# prepare denoising training
|
||||
if self.training:
|
||||
raise NotImplementedError
|
||||
# denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
|
||||
# get_contrastive_denoising_training_group(gt_meta,
|
||||
# self.num_classes,
|
||||
# self.num_queries,
|
||||
# self.denoising_class_embed.weight,
|
||||
# self.num_denoising,
|
||||
# self.label_noise_ratio,
|
||||
# self.box_noise_scale)
|
||||
else:
|
||||
denoising_class, denoising_bbox_unact, attn_mask = None, None, None
|
||||
dn_embed, dn_bbox, attn_mask, dn_meta = \
|
||||
get_cdn_group(batch,
|
||||
self.nc,
|
||||
self.num_queries,
|
||||
self.denoising_class_embed.weight,
|
||||
self.num_denoising,
|
||||
self.label_noise_ratio,
|
||||
self.box_noise_scale,
|
||||
self.training)
|
||||
|
||||
target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
|
||||
self._get_decoder_input(memory, spatial_shapes, denoising_class, denoising_bbox_unact)
|
||||
embed, refer_bbox, enc_bboxes, enc_scores = \
|
||||
self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)
|
||||
|
||||
# decoder
|
||||
out_bboxes, out_logits = self.decoder(target,
|
||||
init_ref_points_unact,
|
||||
memory,
|
||||
spatial_shapes,
|
||||
dec_bboxes, dec_scores = self.decoder(embed,
|
||||
refer_bbox,
|
||||
feats,
|
||||
shapes,
|
||||
self.dec_bbox_head,
|
||||
self.dec_score_head,
|
||||
self.query_pos_head,
|
||||
attn_mask=attn_mask)
|
||||
if not self.training:
|
||||
out_logits = out_logits.sigmoid_()
|
||||
return out_bboxes, out_logits # enc_topk_bboxes, enc_topk_logits, dn_meta
|
||||
dec_scores = dec_scores.sigmoid_()
|
||||
return dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta
|
||||
|
||||
def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
    """Build logit-space anchor boxes for every cell of every feature level.

    Args:
        shapes: Iterable of (h, w) pairs, one per feature level.
        grid_size: Base anchor width/height in normalised units; doubled at each successive level.
        dtype: dtype of the generated tensors.
        device: Device the tensors are created on.
        eps: An anchor is valid only if all four of its values lie strictly inside (eps, 1 - eps).

    Returns:
        anchors: (1, sum(h * w), 4) tensor holding log(a / (1 - a)) of the normalised
            (cx, cy, w, h) anchors; rows that are not valid are filled with +inf.
        valid_mask: (1, sum(h * w), 1) bool tensor, True for valid anchors.
    """
    anchors = []
    for i, (h, w) in enumerate(shapes):
        grid_y, grid_x = torch.meshgrid(torch.arange(end=h, dtype=dtype, device=device),
                                        torch.arange(end=w, dtype=dtype, device=device),
                                        indexing='ij')
        grid_xy = torch.stack([grid_x, grid_y], -1)  # (h, w, 2), last dim ordered (x, y)

        # The normaliser must follow the same (x, y) order as grid_xy, i.e. (w, h).
        # Using (h, w) here mis-scales the centres on every non-square feature map.
        valid_WH = torch.tensor([w, h], dtype=dtype, device=device)
        grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH  # (1, h, w, 2), cell centres in (0, 1)
        wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0 ** i)
        anchors.append(torch.cat([grid_xy, wh], -1).view(-1, h * w, 4))  # (1, h*w, 4)

    anchors = torch.cat(anchors, 1)  # (1, sum(h*w), 4)
    valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True)  # (1, sum(h*w), 1)
    anchors = torch.log(anchors / (1 - anchors))  # inverse sigmoid
    # masked_fill avoids relying on scalar support in torch.where.
    anchors = anchors.masked_fill(~valid_mask, float('inf'))
    return anchors, valid_mask
|
||||
|
||||
def _get_encoder_input(self, x):
    """Project each feature level and flatten all levels into one token sequence.

    Args:
        x: Sequence of feature maps; level ``i`` is passed through ``self.input_proj[i]``.
            Each projected map must be 4-D ([b, c, h, w]).

    Returns:
        feats: Projected maps flattened to [b, h*w, c] and concatenated along dim 1.
        shapes: List of ``[h, w]`` for each level, in input order.
    """
    projected = [self.input_proj[idx](level) for idx, level in enumerate(x)]

    tokens = []
    shapes = []
    for level in projected:
        height, width = level.shape[2:]
        shapes.append([height, width])
        # [b, c, h, w] -> [b, c, h*w] -> [b, h*w, c]
        tokens.append(level.flatten(2).transpose(1, 2))

    return torch.cat(tokens, 1), shapes
|
||||
|
||||
def _get_decoder_input(self, feats, shapes, dn_embed=None, dn_bbox=None):
    """Select the top-scoring encoder tokens and turn them into decoder queries.

    Args:
        feats: Flattened encoder features, indexed as [batch, token, channel].
        shapes: Per-level (h, w) list forwarded to ``self._generate_anchors``.
        dn_embed: Optional embeddings concatenated in front of the selected embeddings along dim 1.
        dn_bbox: Optional boxes concatenated in front of the selected reference boxes along dim 1.

    Returns:
        embeddings: Decoder input embeddings (``dn_embed`` first when given).
        refer_bbox: Un-sigmoided reference boxes (``dn_bbox`` first when given); detached in training.
        enc_bboxes: Sigmoid of the selected encoder boxes, (bs, num_queries, 4).
        enc_scores: Encoder class scores of the selected tokens, (bs, num_queries, nc).
    """
    bs = len(feats)
    nq = self.num_queries

    anchors, valid_mask = self._generate_anchors(shapes, dtype=feats.dtype, device=feats.device)
    # Zero out tokens whose anchor is invalid before the encoder output head.
    features = self.enc_output(torch.where(valid_mask, feats, 0))

    enc_outputs_scores = self.enc_score_head(features)
    enc_outputs_bboxes = self.enc_bbox_head(features) + anchors

    # Query selection: rank tokens by their best class score, keep num_queries per image.
    best_scores = enc_outputs_scores.max(-1).values
    topk_ind = torch.topk(best_scores, nq, dim=1).indices.view(-1)  # (bs * num_queries,)
    batch_ind = torch.arange(end=bs, dtype=topk_ind.dtype).unsqueeze(-1).repeat(1, nq).view(-1)

    def pick(tensor):
        # Gather the selected tokens and restore the (bs, num_queries, -1) layout.
        return tensor[batch_ind, topk_ind].view(bs, nq, -1)

    refer_bbox = pick(enc_outputs_bboxes)  # still in logit space
    enc_bboxes = refer_bbox.sigmoid()
    enc_scores = pick(enc_outputs_scores)

    if dn_bbox is not None:
        refer_bbox = torch.cat([dn_bbox, refer_bbox], 1)
    if self.training:
        refer_bbox = refer_bbox.detach()

    if self.learnt_init_query:
        embeddings = self.tgt_embed.weight.unsqueeze(0).repeat(bs, 1, 1)
    else:
        embeddings = pick(features)
        if self.training:
            embeddings = embeddings.detach()
    if dn_embed is not None:
        embeddings = torch.cat([dn_embed, embeddings], 1)

    return embeddings, refer_bbox, enc_bboxes, enc_scores
|
||||
|
||||
# TODO
|
||||
def _reset_parameters(self):
|
||||
# class and bbox head init
|
||||
bias_cls = bias_init_with_prob(0.01)
|
||||
linear_init_(self.enc_score_head)
|
||||
bias_cls = bias_init_with_prob(0.01) / 80 * self.nc
|
||||
# NOTE: the weight initialization in `linear_init_` would cause NaN when training with custom datasets.
|
||||
# linear_init_(self.enc_score_head)
|
||||
constant_(self.enc_score_head.bias, bias_cls)
|
||||
constant_(self.enc_bbox_head.layers[-1].weight, 0.)
|
||||
constant_(self.enc_bbox_head.layers[-1].bias, 0.)
|
||||
for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
|
||||
linear_init_(cls_)
|
||||
# linear_init_(cls_)
|
||||
constant_(cls_.bias, bias_cls)
|
||||
constant_(reg_.layers[-1].weight, 0.)
|
||||
constant_(reg_.layers[-1].bias, 0.)
|
||||
@ -280,103 +347,3 @@ class RTDETRDecoder(nn.Module):
|
||||
xavier_uniform_(self.query_pos_head.layers[1].weight)
|
||||
for layer in self.input_proj:
|
||||
xavier_uniform_(layer[0].weight)
|
||||
|
||||
def _build_input_proj_layer(self, ch):
    """Create ``self.input_proj``, one projection per feature level.

    The first ``len(ch)`` entries are 1x1 conv + BatchNorm projections from each
    backbone channel count to ``self.hidden_dim``. If ``self.nl`` exceeds ``len(ch)``,
    extra 3x3 stride-2 conv + BatchNorm layers are appended; the first extra layer
    takes ``ch[-1]`` input channels and later ones take ``self.hidden_dim``.

    Args:
        ch: Non-empty sequence of input channel counts, one per backbone level.
    """
    self.input_proj = nn.ModuleList()
    for in_channels in ch:
        self.input_proj.append(
            nn.Sequential(nn.Conv2d(in_channels, self.hidden_dim, kernel_size=1, bias=False),
                          nn.BatchNorm2d(self.hidden_dim)))

    in_channels = ch[-1]
    for _ in range(self.nl - len(ch)):
        # torch.nn exposes Conv2d (lower-case d); the former `nn.Conv2D` spelling raised
        # AttributeError as soon as an extra level was requested.
        self.input_proj.append(
            nn.Sequential(nn.Conv2d(in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1, bias=False),
                          nn.BatchNorm2d(self.hidden_dim)))
        in_channels = self.hidden_dim
|
||||
|
||||
def _generate_anchors(self, spatial_shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
    """Build logit-space anchor boxes for every cell of every feature level.

    Anchors are computed in float32 on the default device and converted to the
    requested ``dtype`` / ``device`` on return.

    Args:
        spatial_shapes: Iterable of (h, w) pairs, one per feature level.
        grid_size: Base anchor width/height in normalised units; doubled at each successive level.
        dtype: dtype of the returned anchors.
        device: Device of the returned tensors.
        eps: An anchor is valid only if all four of its values lie strictly inside (eps, 1 - eps).

    Returns:
        anchors: (1, sum(h * w), 4) tensor holding log(a / (1 - a)) of the normalised
            (cx, cy, w, h) anchors; rows that are not valid are filled with +inf.
        valid_mask: (1, sum(h * w), 1) bool tensor, True for valid anchors.
    """
    anchors = []
    for lvl, (h, w) in enumerate(spatial_shapes):
        grid_y, grid_x = torch.meshgrid(torch.arange(end=h, dtype=torch.float32),
                                        torch.arange(end=w, dtype=torch.float32),
                                        indexing='ij')
        grid_xy = torch.stack([grid_x, grid_y], -1)  # last dim ordered (x, y)

        # Normalise x by w and y by h; the previous (h, w) order mis-scaled the
        # centres on every non-square feature map.
        valid_WH = torch.tensor([w, h]).to(torch.float32)
        grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
        wh = torch.ones_like(grid_xy) * grid_size * (2.0 ** lvl)
        anchors.append(torch.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))

    anchors = torch.concat(anchors, 1)
    valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True)
    anchors = torch.log(anchors / (1 - anchors))  # inverse sigmoid
    # masked_fill avoids relying on scalar support in torch.where.
    anchors = anchors.masked_fill(~valid_mask, float('inf'))
    return anchors.to(device=device, dtype=dtype), valid_mask.to(device=device)
|
||||
|
||||
def _get_encoder_input(self, feats):
    """Project the feature levels, synthesise missing ones, and flatten them.

    Level ``k`` of ``feats`` goes through ``self.input_proj[k]``. When ``self.nl`` is
    larger than ``len(feats)``, extra levels are produced: the first extra projection
    is applied to the last raw input map, each later one to the previously produced level.

    Args:
        feats: Sequence of 4-D feature maps ([b, c, h, w]).

    Returns:
        feat_flatten: All levels flattened to [b, h*w, c] and concatenated along dim 1.
        spatial_shapes: List of ``[h, w]`` per level.
        level_start_index: Start offset of each level inside ``feat_flatten``.
    """
    levels = [self.input_proj[k](f) for k, f in enumerate(feats)]
    n_given = len(levels)
    for k in range(n_given, self.nl):
        source = feats[-1] if k == n_given else levels[-1]
        levels.append(self.input_proj[k](source))

    tokens = []
    spatial_shapes = []
    level_start_index = []
    offset = 0
    for level in levels:
        _, _, h, w = level.shape
        # [b, c, h, w] -> [b, h*w, c]
        tokens.append(level.flatten(2).transpose(1, 2))
        spatial_shapes.append([h, w])
        level_start_index.append(offset)
        offset += h * w

    return torch.concat(tokens, 1), spatial_shapes, level_start_index
|
||||
|
||||
def _get_decoder_input(self, memory, spatial_shapes, denoising_class=None, denoising_bbox_unact=None):
    """Select the top-scoring encoder tokens and turn them into decoder queries.

    Args:
        memory: 3-D flattened encoder features, indexed as [batch, token, channel].
        spatial_shapes: Per-level (h, w) list forwarded to ``self._generate_anchors``.
        denoising_class: Optional embeddings concatenated in front of the selected targets along dim 1.
        denoising_bbox_unact: Optional boxes concatenated in front of the selected
            reference points along dim 1.

    Returns:
        target: Decoder input embeddings (``denoising_class`` first when given).
        reference_points_unact: Un-sigmoided reference boxes (denoising boxes first
            when given); detached in training.
        enc_topk_bboxes: Sigmoid of the selected encoder boxes, (bs, num_queries, 4).
        enc_topk_logits: Encoder class logits of the selected tokens, (bs, num_queries, nc).
    """
    bs, _, _ = memory.shape
    nq = self.num_queries

    anchors, valid_mask = self._generate_anchors(spatial_shapes, dtype=memory.dtype, device=memory.device)
    # Zero out tokens whose anchor is invalid before the encoder output head.
    output_memory = self.enc_output(torch.where(valid_mask, memory, 0))

    enc_outputs_class = self.enc_score_head(output_memory)
    enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors

    # Rank tokens by their best class score and keep num_queries per image.
    topk_ind = torch.topk(enc_outputs_class.max(-1).values, nq, dim=1)[1]
    batch_ind = torch.arange(end=bs, dtype=topk_ind.dtype).unsqueeze(-1).repeat(1, nq).view(-1)
    topk_ind = topk_ind.view(-1)  # (bs * num_queries,)

    def pick(tensor):
        # Gather the selected tokens and restore the (bs, num_queries, -1) layout.
        return tensor[batch_ind, topk_ind].view(bs, nq, -1)

    reference_points_unact = pick(enc_outputs_coord_unact)  # still in logit space
    enc_topk_bboxes = torch.sigmoid(reference_points_unact)
    enc_topk_logits = pick(enc_outputs_class)

    if denoising_bbox_unact is not None:
        reference_points_unact = torch.concat([denoising_bbox_unact, reference_points_unact], 1)
    if self.training:
        reference_points_unact = reference_points_unact.detach()

    if self.learnt_init_query:
        target = self.tgt_embed.weight.unsqueeze(0).repeat(bs, 1, 1)
    else:
        target = pick(output_memory)
        if self.training:
            target = target.detach()
    if denoising_class is not None:
        target = torch.concat([denoising_class, target], 1)

    return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits
|
||||
|
@ -229,23 +229,23 @@ class MSDeformAttn(nn.Module):
|
||||
xavier_uniform_(self.output_proj.weight.data)
|
||||
constant_(self.output_proj.bias.data, 0.)
|
||||
|
||||
def forward(self, query, reference_points, value, value_spatial_shapes, value_mask=None):
|
||||
def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
|
||||
"""
|
||||
https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
|
||||
Args:
|
||||
query (Tensor): [bs, query_length, C]
|
||||
reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
|
||||
query (torch.Tensor): [bs, query_length, C]
|
||||
refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
|
||||
bottom-right (1, 1), including padding area
|
||||
value (Tensor): [bs, value_length, C]
|
||||
value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
|
||||
value (torch.Tensor): [bs, value_length, C]
|
||||
value_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
|
||||
value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements
|
||||
|
||||
Returns:
|
||||
output (Tensor): [bs, Length_{query}, C]
|
||||
"""
|
||||
bs, len_q = query.shape[:2]
|
||||
_, len_v = value.shape[:2]
|
||||
assert sum(s[0] * s[1] for s in value_spatial_shapes) == len_v
|
||||
len_v = value.shape[1]
|
||||
assert sum(s[0] * s[1] for s in value_shapes) == len_v
|
||||
|
||||
value = self.value_proj(value)
|
||||
if value_mask is not None:
|
||||
@ -255,18 +255,17 @@ class MSDeformAttn(nn.Module):
|
||||
attention_weights = self.attention_weights(query).view(bs, len_q, self.n_heads, self.n_levels * self.n_points)
|
||||
attention_weights = F.softmax(attention_weights, -1).view(bs, len_q, self.n_heads, self.n_levels, self.n_points)
|
||||
# N, Len_q, n_heads, n_levels, n_points, 2
|
||||
n = reference_points.shape[-1]
|
||||
if n == 2:
|
||||
offset_normalizer = torch.as_tensor(value_spatial_shapes, dtype=query.dtype, device=query.device).flip(-1)
|
||||
num_points = refer_bbox.shape[-1]
|
||||
if num_points == 2:
|
||||
offset_normalizer = torch.as_tensor(value_shapes, dtype=query.dtype, device=query.device).flip(-1)
|
||||
add = sampling_offsets / offset_normalizer[None, None, None, :, None, :]
|
||||
sampling_locations = reference_points[:, :, None, :, None, :] + add
|
||||
|
||||
elif n == 4:
|
||||
add = sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
|
||||
sampling_locations = reference_points[:, :, None, :, None, :2] + add
|
||||
sampling_locations = refer_bbox[:, :, None, :, None, :] + add
|
||||
elif num_points == 4:
|
||||
add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
|
||||
sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
|
||||
else:
|
||||
raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {n}.')
|
||||
output = multi_scale_deformable_attn_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights)
|
||||
raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {num_points}.')
|
||||
output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
|
||||
output = self.output_proj(output)
|
||||
return output
|
||||
|
||||
@ -308,33 +307,24 @@ class DeformableTransformerDecoderLayer(nn.Module):
|
||||
tgt = self.norm3(tgt)
|
||||
return tgt
|
||||
|
||||
def forward(self,
|
||||
tgt,
|
||||
reference_points,
|
||||
src,
|
||||
src_spatial_shapes,
|
||||
src_padding_mask=None,
|
||||
attn_mask=None,
|
||||
query_pos=None):
|
||||
def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
|
||||
# self attention
|
||||
q = k = self.with_pos_embed(tgt, query_pos)
|
||||
if attn_mask is not None:
|
||||
attn_mask = torch.where(attn_mask.astype('bool'), torch.zeros(attn_mask.shape, tgt.dtype),
|
||||
torch.full(attn_mask.shape, float('-inf'), tgt.dtype))
|
||||
tgt2 = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1))[0].transpose(0, 1)
|
||||
tgt = tgt + self.dropout1(tgt2)
|
||||
tgt = self.norm1(tgt)
|
||||
q = k = self.with_pos_embed(embed, query_pos)
|
||||
tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1),
|
||||
attn_mask=attn_mask)[0].transpose(0, 1)
|
||||
embed = embed + self.dropout1(tgt)
|
||||
embed = self.norm1(embed)
|
||||
|
||||
# cross attention
|
||||
tgt2 = self.cross_attn(self.with_pos_embed(tgt, query_pos), reference_points, src, src_spatial_shapes,
|
||||
src_padding_mask)
|
||||
tgt = tgt + self.dropout2(tgt2)
|
||||
tgt = self.norm2(tgt)
|
||||
tgt = self.cross_attn(self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes,
|
||||
padding_mask)
|
||||
embed = embed + self.dropout2(tgt)
|
||||
embed = self.norm2(embed)
|
||||
|
||||
# ffn
|
||||
tgt = self.forward_ffn(tgt)
|
||||
embed = self.forward_ffn(embed)
|
||||
|
||||
return tgt
|
||||
return embed
|
||||
|
||||
|
||||
class DeformableTransformerDecoder(nn.Module):
|
||||
@ -349,41 +339,40 @@ class DeformableTransformerDecoder(nn.Module):
|
||||
self.hidden_dim = hidden_dim
|
||||
self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx
|
||||
|
||||
def forward(self,
|
||||
tgt,
|
||||
reference_points,
|
||||
src,
|
||||
src_spatial_shapes,
|
||||
bbox_head,
|
||||
score_head,
|
||||
query_pos_head,
|
||||
attn_mask=None,
|
||||
src_padding_mask=None):
|
||||
output = tgt
|
||||
dec_out_bboxes = []
|
||||
dec_out_logits = []
|
||||
ref_points = None
|
||||
ref_points_detach = torch.sigmoid(reference_points)
|
||||
def forward(
|
||||
self,
|
||||
embed, # decoder embeddings
|
||||
refer_bbox, # anchor
|
||||
feats, # image features
|
||||
shapes, # feature shapes
|
||||
bbox_head,
|
||||
score_head,
|
||||
pos_mlp,
|
||||
attn_mask=None,
|
||||
padding_mask=None):
|
||||
output = embed
|
||||
dec_bboxes = []
|
||||
dec_cls = []
|
||||
last_refined_bbox = None
|
||||
refer_bbox = refer_bbox.sigmoid()
|
||||
for i, layer in enumerate(self.layers):
|
||||
ref_points_input = ref_points_detach.unsqueeze(2)
|
||||
query_pos_embed = query_pos_head(ref_points_detach)
|
||||
output = layer(output, ref_points_input, src, src_spatial_shapes, src_padding_mask, attn_mask,
|
||||
query_pos_embed)
|
||||
output = layer(output, refer_bbox, feats, shapes, padding_mask, attn_mask, pos_mlp(refer_bbox))
|
||||
|
||||
inter_ref_bbox = torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points_detach))
|
||||
# refine bboxes, (bs, num_queries+num_denoising, 4)
|
||||
refined_bbox = torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(refer_bbox))
|
||||
|
||||
if self.training:
|
||||
dec_out_logits.append(score_head[i](output))
|
||||
dec_cls.append(score_head[i](output))
|
||||
if i == 0:
|
||||
dec_out_bboxes.append(inter_ref_bbox)
|
||||
dec_bboxes.append(refined_bbox)
|
||||
else:
|
||||
dec_out_bboxes.append(torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points)))
|
||||
dec_bboxes.append(torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(last_refined_bbox)))
|
||||
elif i == self.eval_idx:
|
||||
dec_out_logits.append(score_head[i](output))
|
||||
dec_out_bboxes.append(inter_ref_bbox)
|
||||
dec_cls.append(score_head[i](output))
|
||||
dec_bboxes.append(refined_bbox)
|
||||
break
|
||||
|
||||
ref_points = inter_ref_bbox
|
||||
ref_points_detach = inter_ref_bbox.detach() if self.training else inter_ref_bbox
|
||||
last_refined_bbox = refined_bbox
|
||||
refer_bbox = refined_bbox.detach() if self.training else refined_bbox
|
||||
|
||||
return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits)
|
||||
return torch.stack(dec_bboxes), torch.stack(dec_cls)
|
||||
|
Reference in New Issue
Block a user