Add RTDETR Trainer (#2745)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: Laughing-q <1185102784@qq.com>
Co-authored-by: Kayzwer <68285002+Kayzwer@users.noreply.github.com>
Co-authored-by: Laughing <61612323+Laughing-q@users.noreply.github.com>
This commit is contained in:
Ayush Chaurasia
2023-06-17 17:16:18 +05:30
committed by GitHub
parent 03bce07848
commit a0ba8ef5f0
23 changed files with 989 additions and 314 deletions

View File

@ -163,111 +163,178 @@ class RTDETRDecoder(nn.Module):
self,
nc=80,
ch=(512, 1024, 2048),
hidden_dim=256,
num_queries=300,
strides=(8, 16, 32), # TODO
nl=3,
num_decoder_points=4,
nhead=8,
num_decoder_layers=6,
dim_feedforward=1024,
hd=256, # hidden dim
nq=300, # num queries
ndp=4, # num decoder points
nh=8, # num head
ndl=6, # num decoder layers
d_ffn=1024, # dim of feedforward
dropout=0.,
act=nn.ReLU(),
eval_idx=-1,
# training args
num_denoising=100,
nd=100, # num denoising
label_noise_ratio=0.5,
box_noise_scale=1.0,
learnt_init_query=False):
super().__init__()
assert len(ch) <= nl
assert len(strides) == len(ch)
for _ in range(nl - len(strides)):
strides.append(strides[-1] * 2)
self.hidden_dim = hidden_dim
self.nhead = nhead
self.feat_strides = strides
self.nl = nl
self.hidden_dim = hd
self.nhead = nh
self.nl = len(ch) # num level
self.nc = nc
self.num_queries = num_queries
self.num_decoder_layers = num_decoder_layers
self.num_queries = nq
self.num_decoder_layers = ndl
# backbone feature projection
self._build_input_proj_layer(ch)
self.input_proj = nn.ModuleList(nn.Sequential(nn.Conv2d(x, hd, 1, bias=False), nn.BatchNorm2d(hd)) for x in ch)
# NOTE: simplified version but it's not consistent with .pt weights.
# self.input_proj = nn.ModuleList(Conv(x, hd, act=False) for x in ch)
# Transformer module
decoder_layer = DeformableTransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout, act, nl,
num_decoder_points)
self.decoder = DeformableTransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx)
decoder_layer = DeformableTransformerDecoderLayer(hd, nh, d_ffn, dropout, act, self.nl, ndp)
self.decoder = DeformableTransformerDecoder(hd, decoder_layer, ndl, eval_idx)
# denoising part
self.denoising_class_embed = nn.Embedding(nc, hidden_dim)
self.num_denoising = num_denoising
self.denoising_class_embed = nn.Embedding(nc, hd)
self.num_denoising = nd
self.label_noise_ratio = label_noise_ratio
self.box_noise_scale = box_noise_scale
# decoder embedding
self.learnt_init_query = learnt_init_query
if learnt_init_query:
self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)
self.tgt_embed = nn.Embedding(nq, hd)
self.query_pos_head = MLP(4, 2 * hd, hd, num_layers=2)
# encoder head
self.enc_output = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim))
self.enc_score_head = nn.Linear(hidden_dim, nc)
self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
self.enc_output = nn.Sequential(nn.Linear(hd, hd), nn.LayerNorm(hd))
self.enc_score_head = nn.Linear(hd, nc)
self.enc_bbox_head = MLP(hd, hd, 4, num_layers=3)
# decoder head
self.dec_score_head = nn.ModuleList([nn.Linear(hidden_dim, nc) for _ in range(num_decoder_layers)])
self.dec_bbox_head = nn.ModuleList([
MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers)])
self.dec_score_head = nn.ModuleList([nn.Linear(hd, nc) for _ in range(ndl)])
self.dec_bbox_head = nn.ModuleList([MLP(hd, hd, 4, num_layers=3) for _ in range(ndl)])
self._reset_parameters()
def forward(self, feats, gt_meta=None):
def forward(self, x, batch=None):
from ultralytics.vit.utils.ops import get_cdn_group
# input projection and embedding
memory, spatial_shapes, _ = self._get_encoder_input(feats)
feats, shapes = self._get_encoder_input(x)
# prepare denoising training
if self.training:
raise NotImplementedError
# denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
# get_contrastive_denoising_training_group(gt_meta,
# self.num_classes,
# self.num_queries,
# self.denoising_class_embed.weight,
# self.num_denoising,
# self.label_noise_ratio,
# self.box_noise_scale)
else:
denoising_class, denoising_bbox_unact, attn_mask = None, None, None
dn_embed, dn_bbox, attn_mask, dn_meta = \
get_cdn_group(batch,
self.nc,
self.num_queries,
self.denoising_class_embed.weight,
self.num_denoising,
self.label_noise_ratio,
self.box_noise_scale,
self.training)
target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
self._get_decoder_input(memory, spatial_shapes, denoising_class, denoising_bbox_unact)
embed, refer_bbox, enc_bboxes, enc_scores = \
self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)
# decoder
out_bboxes, out_logits = self.decoder(target,
init_ref_points_unact,
memory,
spatial_shapes,
dec_bboxes, dec_scores = self.decoder(embed,
refer_bbox,
feats,
shapes,
self.dec_bbox_head,
self.dec_score_head,
self.query_pos_head,
attn_mask=attn_mask)
if not self.training:
out_logits = out_logits.sigmoid_()
return out_bboxes, out_logits # enc_topk_bboxes, enc_topk_logits, dn_meta
dec_scores = dec_scores.sigmoid_()
return dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta
def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
anchors = []
for i, (h, w) in enumerate(shapes):
grid_y, grid_x = torch.meshgrid(torch.arange(end=h, dtype=dtype, device=device),
torch.arange(end=w, dtype=dtype, device=device),
indexing='ij')
grid_xy = torch.stack([grid_x, grid_y], -1) # (h, w, 2)
valid_WH = torch.tensor([h, w], dtype=dtype, device=device)
grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH # (1, h, w, 2)
wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0 ** i)
anchors.append(torch.cat([grid_xy, wh], -1).view(-1, h * w, 4)) # (1, h*w, 4)
anchors = torch.cat(anchors, 1) # (1, h*w*nl, 4)
valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True) # 1, h*w*nl, 1
anchors = torch.log(anchors / (1 - anchors))
anchors = torch.where(valid_mask, anchors, torch.inf)
return anchors, valid_mask
def _get_encoder_input(self, x):
# get projection features
x = [self.input_proj[i](feat) for i, feat in enumerate(x)]
# get encoder inputs
feats = []
shapes = []
for feat in x:
h, w = feat.shape[2:]
# [b, c, h, w] -> [b, h*w, c]
feats.append(feat.flatten(2).permute(0, 2, 1))
# [nl, 2]
shapes.append([h, w])
# [b, h*w, c]
feats = torch.cat(feats, 1)
return feats, shapes
def _get_decoder_input(self, feats, shapes, dn_embed=None, dn_bbox=None):
bs = len(feats)
# prepare input for decoder
anchors, valid_mask = self._generate_anchors(shapes, dtype=feats.dtype, device=feats.device)
features = self.enc_output(torch.where(valid_mask, feats, 0)) # bs, h*w, 256
enc_outputs_scores = self.enc_score_head(features) # (bs, h*w, nc)
# dynamic anchors + static content
enc_outputs_bboxes = self.enc_bbox_head(features) + anchors # (bs, h*w, 4)
# query selection
# (bs, num_queries)
topk_ind = torch.topk(enc_outputs_scores.max(-1).values, self.num_queries, dim=1).indices.view(-1)
# (bs, num_queries)
batch_ind = torch.arange(end=bs, dtype=topk_ind.dtype).unsqueeze(-1).repeat(1, self.num_queries).view(-1)
# Unsigmoided
refer_bbox = enc_outputs_bboxes[batch_ind, topk_ind].view(bs, self.num_queries, -1)
# refer_bbox = torch.gather(enc_outputs_bboxes, 1, topk_ind.reshape(bs, self.num_queries).unsqueeze(-1).repeat(1, 1, 4))
enc_bboxes = refer_bbox.sigmoid()
if dn_bbox is not None:
refer_bbox = torch.cat([dn_bbox, refer_bbox], 1)
if self.training:
refer_bbox = refer_bbox.detach()
enc_scores = enc_outputs_scores[batch_ind, topk_ind].view(bs, self.num_queries, -1)
if self.learnt_init_query:
embeddings = self.tgt_embed.weight.unsqueeze(0).repeat(bs, 1, 1)
else:
embeddings = features[batch_ind, topk_ind].view(bs, self.num_queries, -1)
if self.training:
embeddings = embeddings.detach()
if dn_embed is not None:
embeddings = torch.cat([dn_embed, embeddings], 1)
return embeddings, refer_bbox, enc_bboxes, enc_scores
# TODO
def _reset_parameters(self):
# class and bbox head init
bias_cls = bias_init_with_prob(0.01)
linear_init_(self.enc_score_head)
bias_cls = bias_init_with_prob(0.01) / 80 * self.nc
# NOTE: the weight initialization in `linear_init_` would cause NaN when training with custom datasets.
# linear_init_(self.enc_score_head)
constant_(self.enc_score_head.bias, bias_cls)
constant_(self.enc_bbox_head.layers[-1].weight, 0.)
constant_(self.enc_bbox_head.layers[-1].bias, 0.)
for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
linear_init_(cls_)
# linear_init_(cls_)
constant_(cls_.bias, bias_cls)
constant_(reg_.layers[-1].weight, 0.)
constant_(reg_.layers[-1].bias, 0.)
@ -280,103 +347,3 @@ class RTDETRDecoder(nn.Module):
xavier_uniform_(self.query_pos_head.layers[1].weight)
for layer in self.input_proj:
xavier_uniform_(layer[0].weight)
def _build_input_proj_layer(self, ch):
self.input_proj = nn.ModuleList()
for in_channels in ch:
self.input_proj.append(
nn.Sequential(nn.Conv2d(in_channels, self.hidden_dim, kernel_size=1, bias=False),
nn.BatchNorm2d(self.hidden_dim)))
in_channels = ch[-1]
for _ in range(self.nl - len(ch)):
self.input_proj.append(
nn.Sequential(nn.Conv2D(in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(self.hidden_dim)))
in_channels = self.hidden_dim
def _generate_anchors(self, spatial_shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
anchors = []
for lvl, (h, w) in enumerate(spatial_shapes):
grid_y, grid_x = torch.meshgrid(torch.arange(end=h, dtype=torch.float32),
torch.arange(end=w, dtype=torch.float32),
indexing='ij')
grid_xy = torch.stack([grid_x, grid_y], -1)
valid_WH = torch.tensor([h, w]).to(torch.float32)
grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
wh = torch.ones_like(grid_xy) * grid_size * (2.0 ** lvl)
anchors.append(torch.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))
anchors = torch.concat(anchors, 1)
valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True)
anchors = torch.log(anchors / (1 - anchors))
anchors = torch.where(valid_mask, anchors, torch.inf)
return anchors.to(device=device, dtype=dtype), valid_mask.to(device=device)
def _get_encoder_input(self, feats):
# get projection features
proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
if self.nl > len(proj_feats):
len_srcs = len(proj_feats)
for i in range(len_srcs, self.nl):
if i == len_srcs:
proj_feats.append(self.input_proj[i](feats[-1]))
else:
proj_feats.append(self.input_proj[i](proj_feats[-1]))
# get encoder inputs
feat_flatten = []
spatial_shapes = []
level_start_index = [0]
for feat in proj_feats:
_, _, h, w = feat.shape
# [b, c, h, w] -> [b, h*w, c]
feat_flatten.append(feat.flatten(2).permute(0, 2, 1))
# [nl, 2]
spatial_shapes.append([h, w])
# [l], start index of each level
level_start_index.append(h * w + level_start_index[-1])
# [b, l, c]
feat_flatten = torch.concat(feat_flatten, 1)
level_start_index.pop()
return feat_flatten, spatial_shapes, level_start_index
def _get_decoder_input(self, memory, spatial_shapes, denoising_class=None, denoising_bbox_unact=None):
bs, _, _ = memory.shape
# prepare input for decoder
anchors, valid_mask = self._generate_anchors(spatial_shapes, dtype=memory.dtype, device=memory.device)
memory = torch.where(valid_mask, memory, 0)
output_memory = self.enc_output(memory)
enc_outputs_class = self.enc_score_head(output_memory) # (bs, h*w, nc)
enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors # (bs, h*w, 4)
# (bs, topk)
_, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.num_queries, dim=1)
# extract region proposal boxes
# (bs, topk_ind)
batch_ind = torch.arange(end=bs, dtype=topk_ind.dtype).unsqueeze(-1).repeat(1, self.num_queries).view(-1)
topk_ind = topk_ind.view(-1)
# Unsigmoided
reference_points_unact = enc_outputs_coord_unact[batch_ind, topk_ind].view(bs, self.num_queries, -1)
enc_topk_bboxes = torch.sigmoid(reference_points_unact)
if denoising_bbox_unact is not None:
reference_points_unact = torch.concat([denoising_bbox_unact, reference_points_unact], 1)
if self.training:
reference_points_unact = reference_points_unact.detach()
enc_topk_logits = enc_outputs_class[batch_ind, topk_ind].view(bs, self.num_queries, -1)
# extract region features
if self.learnt_init_query:
target = self.tgt_embed.weight.unsqueeze(0).repeat(bs, 1, 1)
else:
target = output_memory[batch_ind, topk_ind].view(bs, self.num_queries, -1)
if self.training:
target = target.detach()
if denoising_class is not None:
target = torch.concat([denoising_class, target], 1)
return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits

View File

@ -229,23 +229,23 @@ class MSDeformAttn(nn.Module):
xavier_uniform_(self.output_proj.weight.data)
constant_(self.output_proj.bias.data, 0.)
def forward(self, query, reference_points, value, value_spatial_shapes, value_mask=None):
def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
"""
https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
Args:
query (Tensor): [bs, query_length, C]
reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
query (torch.Tensor): [bs, query_length, C]
refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
bottom-right (1, 1), including padding area
value (Tensor): [bs, value_length, C]
value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
value (torch.Tensor): [bs, value_length, C]
value_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements
Returns:
output (Tensor): [bs, Length_{query}, C]
"""
bs, len_q = query.shape[:2]
_, len_v = value.shape[:2]
assert sum(s[0] * s[1] for s in value_spatial_shapes) == len_v
len_v = value.shape[1]
assert sum(s[0] * s[1] for s in value_shapes) == len_v
value = self.value_proj(value)
if value_mask is not None:
@ -255,18 +255,17 @@ class MSDeformAttn(nn.Module):
attention_weights = self.attention_weights(query).view(bs, len_q, self.n_heads, self.n_levels * self.n_points)
attention_weights = F.softmax(attention_weights, -1).view(bs, len_q, self.n_heads, self.n_levels, self.n_points)
# N, Len_q, n_heads, n_levels, n_points, 2
n = reference_points.shape[-1]
if n == 2:
offset_normalizer = torch.as_tensor(value_spatial_shapes, dtype=query.dtype, device=query.device).flip(-1)
num_points = refer_bbox.shape[-1]
if num_points == 2:
offset_normalizer = torch.as_tensor(value_shapes, dtype=query.dtype, device=query.device).flip(-1)
add = sampling_offsets / offset_normalizer[None, None, None, :, None, :]
sampling_locations = reference_points[:, :, None, :, None, :] + add
elif n == 4:
add = sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
sampling_locations = reference_points[:, :, None, :, None, :2] + add
sampling_locations = refer_bbox[:, :, None, :, None, :] + add
elif num_points == 4:
add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
else:
raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {n}.')
output = multi_scale_deformable_attn_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights)
raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {num_points}.')
output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
output = self.output_proj(output)
return output
@ -308,33 +307,24 @@ class DeformableTransformerDecoderLayer(nn.Module):
tgt = self.norm3(tgt)
return tgt
def forward(self,
tgt,
reference_points,
src,
src_spatial_shapes,
src_padding_mask=None,
attn_mask=None,
query_pos=None):
def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
# self attention
q = k = self.with_pos_embed(tgt, query_pos)
if attn_mask is not None:
attn_mask = torch.where(attn_mask.astype('bool'), torch.zeros(attn_mask.shape, tgt.dtype),
torch.full(attn_mask.shape, float('-inf'), tgt.dtype))
tgt2 = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1))[0].transpose(0, 1)
tgt = tgt + self.dropout1(tgt2)
tgt = self.norm1(tgt)
q = k = self.with_pos_embed(embed, query_pos)
tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1),
attn_mask=attn_mask)[0].transpose(0, 1)
embed = embed + self.dropout1(tgt)
embed = self.norm1(embed)
# cross attention
tgt2 = self.cross_attn(self.with_pos_embed(tgt, query_pos), reference_points, src, src_spatial_shapes,
src_padding_mask)
tgt = tgt + self.dropout2(tgt2)
tgt = self.norm2(tgt)
tgt = self.cross_attn(self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes,
padding_mask)
embed = embed + self.dropout2(tgt)
embed = self.norm2(embed)
# ffn
tgt = self.forward_ffn(tgt)
embed = self.forward_ffn(embed)
return tgt
return embed
class DeformableTransformerDecoder(nn.Module):
@ -349,41 +339,40 @@ class DeformableTransformerDecoder(nn.Module):
self.hidden_dim = hidden_dim
self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx
def forward(self,
tgt,
reference_points,
src,
src_spatial_shapes,
bbox_head,
score_head,
query_pos_head,
attn_mask=None,
src_padding_mask=None):
output = tgt
dec_out_bboxes = []
dec_out_logits = []
ref_points = None
ref_points_detach = torch.sigmoid(reference_points)
def forward(
self,
embed, # decoder embeddings
refer_bbox, # anchor
feats, # image features
shapes, # feature shapes
bbox_head,
score_head,
pos_mlp,
attn_mask=None,
padding_mask=None):
output = embed
dec_bboxes = []
dec_cls = []
last_refined_bbox = None
refer_bbox = refer_bbox.sigmoid()
for i, layer in enumerate(self.layers):
ref_points_input = ref_points_detach.unsqueeze(2)
query_pos_embed = query_pos_head(ref_points_detach)
output = layer(output, ref_points_input, src, src_spatial_shapes, src_padding_mask, attn_mask,
query_pos_embed)
output = layer(output, refer_bbox, feats, shapes, padding_mask, attn_mask, pos_mlp(refer_bbox))
inter_ref_bbox = torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points_detach))
# refine bboxes, (bs, num_queries+num_denoising, 4)
refined_bbox = torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(refer_bbox))
if self.training:
dec_out_logits.append(score_head[i](output))
dec_cls.append(score_head[i](output))
if i == 0:
dec_out_bboxes.append(inter_ref_bbox)
dec_bboxes.append(refined_bbox)
else:
dec_out_bboxes.append(torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points)))
dec_bboxes.append(torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(last_refined_bbox)))
elif i == self.eval_idx:
dec_out_logits.append(score_head[i](output))
dec_out_bboxes.append(inter_ref_bbox)
dec_cls.append(score_head[i](output))
dec_bboxes.append(refined_bbox)
break
ref_points = inter_ref_bbox
ref_points_detach = inter_ref_bbox.detach() if self.training else inter_ref_bbox
last_refined_bbox = refined_bbox
refer_bbox = refined_bbox.detach() if self.training else refined_bbox
return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits)
return torch.stack(dec_bboxes), torch.stack(dec_cls)