RT-DETR改进策略【Neck】| 2023 显式视觉中心EVC 优化特征提取金字塔,对密集预测任务非常有效
一、本文介绍
本文记录的是
利用
显式视觉中心EVC
优化RT-DETR的目标检测网络模型
。利用
EVC
改进颈部网络,通过
轻量级MLP
和
可学习视觉中心机制LVC
能够同时
捕获全局长程依赖
和
保留局部角落区域信息
,
在结构简单、体积轻便的同时,提高密集预测任务检测性能。
二、EVC介绍
Centralized Feature Pyramid for Object Detection
2.1 出发点
- 现有的方法大多集中在层间特征交互,忽略了层内特征规则,且一些利用注意力机制或视觉变换器学习层内特征表示的方法忽略了对密集预测任务重要的角落区域。为了解决这些问题,设计了ECV模块。
2.2 原理
- 一方面通过轻量级MLP架构捕获顶层特征 x 4 x_{4} x 4 的全局长程依赖(全局信息),另一方面通过可学习的视觉中心机制聚合输入图像的局部关键区域(局部信息),然后将这两部分结果沿通道维度拼接作为ECV的输出用于下游识别。
2.3 结构
2.3.1 轻量级MLP部分
- 主要由两个残差模块组成,一个是基于深度可分离卷积的模块,另一个是基于通道MLP的模块。输入的特征先经过一个 7 × 7 7×7 7 × 7 卷积(输出通道大小为256)、批量归一化层和激活函数层组成的Stem块处理得到 X i n X_{in} X in 。对于基于深度可分离卷积的模块, X i n X_{in} X in 先进入深度可分离卷积层,经过组归一化处理,然后进行通道缩放和DropPath操作,再加上 X i n X_{in} X in 的残差连接得到 X ~ i n \tilde{X}_{in} X ~ in 。对于基于通道MLP的模块, X ~ i n \tilde{X}_{in} X ~ in 先进行组归一化,然后实施通道MLP,接着进行通道缩放、DropPath操作以及 X ~ i n \tilde{X}_{in} X ~ in 的残差连接得到 M L P ( X i n ) MLP(X_{in}) M L P ( X in ) 。
2.3.2 可学习视觉中心机制部分(LVC)
- 具有一个固有字典,包括固有码本 B = { b 1 , b 2 , ⋯ , b K } B = \{b_{1},b_{2},\cdots,b_{K}\} B = { b 1 , b 2 , ⋯ , b K } 和一组缩放因子 S = { s 1 , s 2 , ⋯ , s K } S=\{s_{1},s_{2},\cdots,s_{K}\} S = { s 1 , s 2 , ⋯ , s K } 。特征从Stem块 x i n x_{in} x in 先经过一组卷积层( 1 × 1 1×1 1 × 1 卷积、 3 × 3 3×3 3 × 3 卷积和 1 × 1 1×1 1 × 1 卷积)编码,然后经过一个由 3 × 3 3×3 3 × 3 卷积、BN层和ReLU激活函数组成的CBR块处理,进入码本。通过缩放因子 s s s 计算得到关于第 k k k 个码字的信息 e k e_{k} e k ,然后用 ϕ \phi ϕ 融合所有 e k e_{k} e k 得到关于 K K K 个码字的全信息(e)。 e e e 经过一个全连接层和 1 × 1 1×1 1 × 1 卷积层预测突出关键类的特征,再与输入特征 X i n X_{in} X in 进行通道乘法和通道加法操作得到 L V C ( X i n ) LVC(X_{in}) L V C ( X in ) 。
2.4 优势
- 信息全面 :能够同时捕获全局长程依赖和保留局部角落区域信息,这对于密集预测任务非常重要。
- 结构优势 :轻量级MLP结构简单、体积更轻且计算效率更高,相比基于多头注意力机制的变换器编码器有优势。
论文: https://arxiv.org/pdf/2210.02093
源码: https://github.com/QY1994-0919/CFPNet
三、EVC的实现代码
EVC模块
的实现代码如下:
import torch
import torch.nn as nn
from timm.models.layers import DropPath, trunc_normal_
from functools import partial
import torch.nn.functional as F
def autopad(k, p=None, d=1): # kernel, padding, dilation
"""Pad to 'same' shape outputs."""
if d > 1:
k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size
if p is None:
p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
return p
class Conv(nn.Module):
"""Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""
default_act = nn.SiLU() # default activation
def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
"""Initialize Conv layer with given arguments including activation."""
super().__init__()
self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
self.bn = nn.BatchNorm2d(c2)
self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
def forward(self, x):
"""Apply convolution, batch normalization and activation to input tensor."""
return self.act(self.bn(self.conv(x)))
def forward_fuse(self, x):
"""Perform transposed convolution of 2D data."""
return self.act(self.conv(x))
class Encoding(nn.Module):
def __init__(self, in_channels, num_codes):
super(Encoding, self).__init__()
# init codewords and smoothing factor
self.in_channels, self.num_codes = in_channels, num_codes
num_codes = 64
std = 1. / ((num_codes * in_channels) ** 0.5)
# [num_codes, channels]
self.codewords = nn.Parameter(
torch.empty(num_codes, in_channels, dtype=torch.float).uniform_(-std, std), requires_grad=True)
# [num_codes]
self.scale = nn.Parameter(torch.empty(num_codes, dtype=torch.float).uniform_(-1, 0), requires_grad=True)
@staticmethod
def scaled_l2(x, codewords, scale):
num_codes, in_channels = codewords.size()
b = x.size(0)
expanded_x = x.unsqueeze(2).expand((b, x.size(1), num_codes, in_channels))
# ---处理codebook (num_code, c1)
reshaped_codewords = codewords.view((1, 1, num_codes, in_channels))
# 把scale从1, num_code变成 batch, c2, N, num_codes
reshaped_scale = scale.view((1, 1, num_codes)) # N, num_codes
# ---计算rik = z1 - d # b, N, num_codes
scaled_l2_norm = reshaped_scale * (expanded_x - reshaped_codewords).pow(2).sum(dim=3)
return scaled_l2_norm
@staticmethod
def aggregate(assignment_weights, x, codewords):
num_codes, in_channels = codewords.size()
# ---处理codebook
reshaped_codewords = codewords.view((1, 1, num_codes, in_channels))
b = x.size(0)
# ---处理特征向量x b, c1, N
expanded_x = x.unsqueeze(2).expand((b, x.size(1), num_codes, in_channels))
# 变换rei b, N, num_codes,-
assignment_weights = assignment_weights.unsqueeze(3) # b, N, num_codes,
# ---开始计算eik,必须在Rei计算完之后
encoded_feat = (assignment_weights * (expanded_x - reshaped_codewords)).sum(1)
return encoded_feat
def forward(self, x):
assert x.dim() == 4 and x.size(1) == self.in_channels
b, in_channels, w, h = x.size()
# [batch_size, height x width, channels]
x = x.view(b, self.in_channels, -1).transpose(1, 2).contiguous()
# assignment_weights: [batch_size, channels, num_codes]
assignment_weights = torch.softmax(self.scaled_l2(x, self.codewords, self.scale), dim=2)
# aggregate
encoded_feat = self.aggregate(assignment_weights, x, self.codewords)
return encoded_feat
class Mlp(nn.Module):
"""
Implementation of MLP with 1*1 convolutions. Input: tensor with shape [B, C, H, W]
"""
def __init__(self, in_features, hidden_features=None,
out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Conv2d(in_features, hidden_features, 1)
self.act = act_layer()
self.fc2 = nn.Conv2d(hidden_features, out_features, 1)
self.drop = nn.Dropout(drop)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Conv2d):
trunc_normal_(m.weight, std=.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
# 1*1 3*3 1*1
class ConvBlock(nn.Module):
def __init__(self, in_channels, out_channels, stride=1, res_conv=False, act_layer=nn.SiLU, groups=1,
norm_layer=partial(nn.BatchNorm2d, eps=1e-6)):
super(ConvBlock, self).__init__()
self.in_channels = in_channels
expansion = 4
c = out_channels // expansion
self.conv1 = Conv(in_channels, c, act=nn.SiLU())
self.conv2 = Conv(c, c, k=3, s=stride, g=groups, act=nn.SiLU())
self.conv3 = Conv(c, out_channels, 1, act=False)
self.act3 = act_layer(inplace=True)
if res_conv:
self.residual_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False)
self.residual_bn = norm_layer(out_channels)
self.res_conv = res_conv
def zero_init_last_bn(self):
nn.init.zeros_(self.bn3.weight)
def forward(self, x, return_x_2=True):
residual = x
x = self.conv1(x)
x2 = self.conv2(x) # if x_t_r is None else self.conv2(x + x_t_r)
x = self.conv3(x2)
if self.res_conv:
residual = self.residual_conv(residual)
residual = self.residual_bn(residual)
x += residual
x = self.act3(x)
if return_x_2:
return x, x2
else:
return x
class Mean(nn.Module):
def __init__(self, dim, keep_dim=False):
super(Mean, self).__init__()
self.dim = dim
self.keep_dim = keep_dim
def forward(self, input):
return input.mean(self.dim, self.keep_dim)
class LVCBlock(nn.Module):
def __init__(self, in_channels, out_channels, num_codes, channel_ratio=0.25, base_channel=64):
super(LVCBlock, self).__init__()
self.out_channels = out_channels
self.num_codes = num_codes
num_codes = 64
self.conv_1 = ConvBlock(in_channels=in_channels, out_channels=in_channels, res_conv=True, stride=1)
self.LVC = nn.Sequential(
Conv(in_channels, in_channels, 1, act=nn.SiLU()),
Encoding(in_channels=in_channels, num_codes=num_codes),
nn.BatchNorm1d(num_codes),
nn.SiLU(inplace=True),
Mean(dim=1))
self.fc = nn.Sequential(nn.Linear(in_channels, in_channels), nn.Sigmoid())
def forward(self, x):
x = self.conv_1(x, return_x_2=False)
en = self.LVC(x)
gam = self.fc(en)
b, in_channels, _, _ = x.size()
y = gam.view(b, in_channels, 1, 1)
x = F.relu_(x + x * y)
return x
class GroupNorm(nn.GroupNorm):
"""
Group Normalization with 1 group.
Input: tensor in shape [B, C, H, W]
"""
def __init__(self, num_channels, **kwargs):
super().__init__(1, num_channels, **kwargs)
class DWConv_LMLP(nn.Module):
"""Depthwise Conv + Conv"""
def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"):
super().__init__()
self.dconv = Conv(
in_channels,
in_channels,
k=ksize,
s=stride,
g=in_channels,
)
self.pconv = Conv(
in_channels, out_channels, k=1, s=1, g=1
)
def forward(self, x):
x = self.dconv(x)
return self.pconv(x)
# LightMLPBlock
class LightMLPBlock(nn.Module):
def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu",
mlp_ratio=4., drop=0., act_layer=nn.GELU,
use_layer_scale=True, layer_scale_init_value=1e-5, drop_path=0.,
norm_layer=GroupNorm): # act_layer=nn.GELU,
super().__init__()
self.dw = DWConv_LMLP(in_channels, out_channels, ksize=1, stride=1, act="silu")
self.linear = nn.Linear(out_channels, out_channels) # learnable position embedding
self.out_channels = out_channels
self.norm1 = norm_layer(in_channels)
self.norm2 = norm_layer(in_channels)
mlp_hidden_dim = int(in_channels * mlp_ratio)
self.mlp = Mlp(in_features=in_channels, hidden_features=mlp_hidden_dim, act_layer=nn.GELU,
drop=drop)
self.drop_path = DropPath(drop_path) if drop_path > 0. \
else nn.Identity()
self.use_layer_scale = use_layer_scale
if use_layer_scale:
self.layer_scale_1 = nn.Parameter(
layer_scale_init_value * torch.ones(out_channels), requires_grad=True)
self.layer_scale_2 = nn.Parameter(
layer_scale_init_value * torch.ones(out_channels), requires_grad=True)
def forward(self, x):
if self.use_layer_scale:
x = x + self.drop_path(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * self.dw(self.norm1(x)))
x = x + self.drop_path(self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(self.norm2(x)))
else:
x = x + self.drop_path(self.dw(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
# EVCBlock
class EVCBlock(nn.Module):
def __init__(self, in_channels, out_channels, channel_ratio=4, base_channel=16):
super().__init__()
expansion = 2
ch = out_channels * expansion
# Stem stage: get the feature maps by conv block (copied form resnet.py) 进入conformer框架之前的处理
self.conv1 = Conv(in_channels, in_channels, k=3, act=nn.SiLU())
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) # 1 / 4 [56, 56]
# LVC
self.lvc = LVCBlock(in_channels=in_channels, out_channels=out_channels, num_codes=64) # c1值暂时未定
# LightMLPBlock
self.l_MLP = LightMLPBlock(in_channels, out_channels, ksize=3, stride=1, act="silu", act_layer=nn.GELU,
mlp_ratio=4., drop=0.,
use_layer_scale=True, layer_scale_init_value=1e-5, drop_path=0.,
norm_layer=GroupNorm)
self.cnv1 = nn.Conv2d(ch, out_channels, kernel_size=1, stride=1, padding=0)
def forward(self, x):
x1 = self.maxpool((self.conv1(x)))
# LVCBlock
x_lvc = self.lvc(x1)
# LightMLPBlock
x_lmlp = self.l_MLP(x1)
# concat
x = torch.cat((x_lvc, x_lmlp), dim=1)
x = self.cnv1(x)
return x
四、添加步骤
4.1 修改一
① 在
ultralytics/nn/
目录下新建
AddModules
文件夹用于存放模块代码
② 在
AddModules
文件夹下新建
EVC.py
,将
第三节
中的代码粘贴到此处
4.2 修改二
在
AddModules
文件夹下新建
__init__.py
(已有则不用新建),在文件内导入模块:
from .EVC import *
4.3 修改三
在
ultralytics/nn/modules/tasks.py
文件中,需要在两处位置添加各模块类名称。
首先:导入模块
其次:在
parse_model函数
中的注册
EVCBlock
模块
五、yaml模型文件
5.1 模型改进版本⭐
此处以
ultralytics/cfg/models/rt-detr/rtdetr-l.yaml
为例,在同目录下创建一个用于自己数据集训练的模型文件
rtdetr-l-EVCBlock.yaml
。
将
rtdetr-l.yaml
中的内容复制到
rtdetr-l-EVCBlock.yaml
文件下,修改
nc
数量等于自己数据中目标的数量。
📌 模型的修改方法是将 颈部网络 进行修改。
# Ultralytics YOLO 🚀, AGPL-3.0 license
# RT-DETR-l object detection model with P3-P5 outputs. For details see https://docs.ultralytics.com/models/rtdetr
# Parameters
nc: 1 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n-cls.yaml' will call yolov8-cls.yaml with scale 'n'
# [depth, width, max_channels]
l: [1.00, 1.00, 1024]
backbone:
# [from, repeats, module, args]
- [-1, 1, HGStem, [32, 48]] # 0-P2/4
- [-1, 6, HGBlock, [48, 128, 3]] # stage 1
- [-1, 1, DWConv, [128, 3, 2, 1, False]] # 2-P3/8
- [-1, 6, HGBlock, [96, 512, 3]] # stage 2
- [-1, 1, DWConv, [512, 3, 2, 1, False]] # 4-P4/16
- [-1, 6, HGBlock, [192, 1024, 5, True, False]] # cm, c2, k, light, shortcut
- [-1, 6, HGBlock, [192, 1024, 5, True, True]]
- [-1, 6, HGBlock, [192, 1024, 5, True, True]] # stage 3
- [-1, 1, DWConv, [1024, 3, 2, 1, False]] # 8-P5/32
- [-1, 6, HGBlock, [384, 2048, 5, True, False]] # stage 4
head:
- [-1, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 10 input_proj.2
- [-1, 1, AIFI, [1024, 8]]
- [-1, 1, Conv, [256, 1, 1]] # 12, Y5, lateral_convs.0
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [7, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 14 input_proj.1
- [[-2, -1], 1, Concat, [1]]
- [-1, 3, RepC3, [256]] # 16, fpn_blocks.0
- [-1, 1, Conv, [256, 1, 1]] # 17, Y4, lateral_convs.1
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [3, 1, EVCBlock, [512]] # 19 input_proj.0
- [[-2, -1], 1, Concat, [1]] # cat backbone P4
- [-1, 3, RepC3, [256]] # X3 (21), fpn_blocks.1
- [-1, 1, Conv, [256, 3, 2]] # 22, downsample_convs.0
- [[-1, 17], 1, Concat, [1]] # cat Y4
- [-1, 3, RepC3, [256]] # F4 (24), pan_blocks.0
- [-1, 1, Conv, [256, 3, 2]] # 25, downsample_convs.1
- [[-1, 12], 1, Concat, [1]] # cat Y5
- [-1, 3, RepC3, [256]] # F5 (27), pan_blocks.1
- [[21, 24, 27], 1, RTDETRDecoder, [nc]] # Detect(P3, P4, P5)
六、成功运行结果
打印网络模型可以看到颈部网络已经修改完成,并可以进行训练了。
rtdetr-l-EVCBlock :
rtdetr-l-EVCBlock summary: 731 layers, 39,424,899 parameters, 39,424,899 gradients, 185.5 GFLOPs
from n params module arguments
0 -1 1 25248 ultralytics.nn.modules.block.HGStem [3, 32, 48]
1 -1 6 155072 ultralytics.nn.modules.block.HGBlock [48, 48, 128, 3, 6]
2 -1 1 1408 ultralytics.nn.modules.conv.DWConv [128, 128, 3, 2, 1, False]
3 -1 6 839296 ultralytics.nn.modules.block.HGBlock [128, 96, 512, 3, 6]
4 -1 1 5632 ultralytics.nn.modules.conv.DWConv [512, 512, 3, 2, 1, False]
5 -1 6 1695360 ultralytics.nn.modules.block.HGBlock [512, 192, 1024, 5, 6, True, False]
6 -1 6 2055808 ultralytics.nn.modules.block.HGBlock [1024, 192, 1024, 5, 6, True, True]
7 -1 6 2055808 ultralytics.nn.modules.block.HGBlock [1024, 192, 1024, 5, 6, True, True]
8 -1 1 11264 ultralytics.nn.modules.conv.DWConv [1024, 1024, 3, 2, 1, False]
9 -1 6 6708480 ultralytics.nn.modules.block.HGBlock [1024, 384, 2048, 5, 6, True, False]
10 -1 1 524800 ultralytics.nn.modules.conv.Conv [2048, 256, 1, 1, None, 1, 1, False]
11 -1 1 789760 ultralytics.nn.modules.transformer.AIFI [256, 1024, 8]
12 -1 1 66048 ultralytics.nn.modules.conv.Conv [256, 256, 1, 1]
13 -1 1 0 torch.nn.modules.upsampling.Upsample [None, 2, 'nearest']
14 7 1 262656 ultralytics.nn.modules.conv.Conv [1024, 256, 1, 1, None, 1, 1, False]
15 [-2, -1] 1 0 ultralytics.nn.modules.conv.Concat [1]
16 -1 3 2232320 ultralytics.nn.modules.block.RepC3 [512, 256, 3]
17 -1 1 66048 ultralytics.nn.modules.conv.Conv [256, 256, 1, 1]
18 -1 1 0 torch.nn.modules.upsampling.Upsample [None, 2, 'nearest']
19 3 1 6617280 ultralytics.nn.AddModules.EVC.EVCBlock [512, 512]
20 [-2, -1] 1 0 ultralytics.nn.modules.conv.Concat [1]
21 -1 3 2363392 ultralytics.nn.modules.block.RepC3 [768, 256, 3]
22 -1 1 590336 ultralytics.nn.modules.conv.Conv [256, 256, 3, 2]
23 [-1, 17] 1 0 ultralytics.nn.modules.conv.Concat [1]
24 -1 3 2232320 ultralytics.nn.modules.block.RepC3 [512, 256, 3]
25 -1 1 590336 ultralytics.nn.modules.conv.Conv [256, 256, 3, 2]
26 [-1, 12] 1 0 ultralytics.nn.modules.conv.Concat [1]
27 -1 3 2232320 ultralytics.nn.modules.block.RepC3 [512, 256, 3]
28 [21, 24, 27] 1 7303907 ultralytics.nn.modules.head.RTDETRDecoder [1, [256, 256, 256]]
rtdetr-l-EVCBlock summary: 731 layers, 39,424,899 parameters, 39,424,899 gradients, 185.5 GFLOPs