YOLOv5改进系列(20)——添加BiFormer注意力机制(CVPR2023|小目标涨点神器)
![]()
🚀一、BiFormer介绍
- 论文题目:《BiFormer: Vision Transformer with Bi-Level Routing Attention》
- 论文地址:https://arxiv.org/pdf/2303.08810.pdf
- 源码地址:https://github.com/rayleizhu/BiFormer
1.1 简介
BiFormer 模块提出了一种动态的稀疏注意力机制。首先在宏观上过滤掉大部分不相关的 key 与 value,只保留一小部分相关的 key 和 value;然后在这一小部分相关的 key 与 value 中使用 token-to-token 注意力。
1.2 模型结构

具体结构:
- 第一阶段(i=1)使用重叠的 patch embedding 对输入进行嵌入,降低输入的空间分辨率
- 第二到第四阶段(i=2,3,4)使用 patch merging 模块,在降低分辨率的同时增加通道数量
- 每个阶段随后使用 Ni 个相连的 BiFormer 块对输入特征进行 transformer 操作
🚀二、具体添加方法
第①步:在common.py中添加BiFormer模块
将以下代码复制粘贴到common.py文件的末尾
- # ------------------代码转自@迪菲赫尔曼------------------------------
- from torch import Tensor
- from typing import Tuple
- import torch.nn.functional as F
- from einops import rearrange
class TopkRouting(nn.Module):
    """Top-k routing with scaling.

    Scores every query against every key with a scaled dot product, keeps the
    ``topk`` highest-scoring keys per query and softmax-normalises those scores.

    Args:
        qk_dim: int, feature dimension of query and key.
        topk: int, number of keys kept per query.
        qk_scale: float or None, multiplier applied to the query before the dot
            product. Any falsy value (``None`` or ``0``) falls back to
            ``qk_dim ** -0.5``.
        param_routing: bool, whether to pass query and key through a shared
            learnable ``nn.Linear`` before scoring (identity otherwise).
        diff_routing: bool, whether gradients may flow through the routing.
            When False, query and key are detached first.
    """

    def __init__(self, qk_dim, topk=4, qk_scale=None, param_routing=False, diff_routing=False):
        super().__init__()
        self.topk = topk
        self.qk_dim = qk_dim
        self.scale = qk_scale or qk_dim ** -0.5
        self.diff_routing = diff_routing
        # The same embedding is applied to both query and key.
        self.emb = nn.Linear(qk_dim, qk_dim) if param_routing else nn.Identity()
        self.routing_act = nn.Softmax(dim=-1)

    def forward(self, query: Tensor, key: Tensor) -> Tuple[Tensor, Tensor]:
        """Select the top-k keys for every query.

        Args:
            query: tensor whose last dimension is the feature dimension.
            key: tensor whose last two dimensions are (num_keys, feature).

        Returns:
            ``(r_weight, topk_index)``: the softmax over the kept logits and the
            indices of the kept keys; both have ``topk`` as their last dimension.
        """
        if not self.diff_routing:
            # Non-differentiable routing: cut the graph before scoring.
            query, key = query.detach(), key.detach()
        query_hat, key_hat = self.emb(query), self.emb(key)
        attn_logit = (query_hat * self.scale) @ key_hat.transpose(-2, -1)
        topk_attn_logit, topk_index = torch.topk(attn_logit, k=self.topk, dim=-1)
        # The softmax is taken over the kept logits only, not over all keys.
        r_weight = self.routing_act(topk_attn_logit)
        return r_weight, topk_index
class KVGather(nn.Module):
    """Gather the key/value tokens of the routed windows for every query window.

    Args:
        mul_weight: one of ``'none'``, ``'soft'`` or ``'hard'``.
            ``'none'`` returns the gathered tokens unchanged, ``'soft'``
            multiplies them by the routing weights, and ``'hard'`` is accepted
            by the constructor but raises ``NotImplementedError`` in ``forward``.
    """

    def __init__(self, mul_weight='none'):
        super().__init__()
        assert mul_weight in ['none', 'soft', 'hard']
        self.mul_weight = mul_weight

    def forward(self, r_idx: Tensor, r_weight: Tensor, kv: Tensor):
        """Select, per query window, the key/value tokens of its routed windows.

        Args:
            r_idx: (n, p2, topk) indices of the selected windows.
            r_weight: (n, p2, topk) routing weights; only used in 'soft' mode.
            kv: (n, p2, w2, c_kv) key/value tokens of every window.

        Returns:
            (n, p2, topk, w2, c_kv) tensor of gathered key/value tokens.

        Raises:
            NotImplementedError: if ``mul_weight`` is ``'hard'``.
        """
        n, p2, w2, c_kv = kv.size()
        topk = r_idx.size(-1)
        # Broadcast kv so every query window sees all p2 windows, then pick the
        # topk of them along dim=2. expand() creates views, not copies.
        topk_kv = torch.gather(
            kv.view(n, 1, p2, w2, c_kv).expand(-1, p2, -1, -1, -1),
            dim=2,
            index=r_idx.view(n, p2, topk, 1, 1).expand(-1, -1, -1, w2, c_kv),
        )
        if self.mul_weight == 'soft':
            topk_kv = r_weight.view(n, p2, topk, 1, 1) * topk_kv
        elif self.mul_weight == 'hard':
            raise NotImplementedError('differentiable hard routing TBA')
        return topk_kv
class QKVLinear(nn.Module):
    """Single linear projection that produces the query and a fused key/value.

    Args:
        dim: input feature dimension; also the width of the value part.
        qk_dim: width of the query and of the key part.
        bias: whether the linear layer has a bias term.
    """

    def __init__(self, dim, qk_dim, bias=True):
        super().__init__()
        self.dim = dim
        self.qk_dim = qk_dim
        # Output layout along the last dim: [q (qk_dim) | k (qk_dim) | v (dim)].
        self.qkv = nn.Linear(dim, qk_dim + qk_dim + dim, bias=bias)

    def forward(self, x):
        """Project ``x`` and split the result along the last dimension.

        Returns:
            ``(q, kv)`` where ``q`` has ``qk_dim`` channels and ``kv`` has
            ``qk_dim + dim`` channels (key first, then value).
        """
        q, kv = self.qkv(x).split([self.qk_dim, self.qk_dim + self.dim], dim=-1)
        return q, kv
class BiLevelRoutingAttention(nn.Module):
    """Bi-level routing attention.

    The feature map is split into ``n_win * n_win`` windows. Window-level
    queries and keys (the mean over each window) are routed with top-k
    selection, and every window then attends token-to-token only to the
    key/value tokens of its ``topk`` routed windows. A depth-wise convolution
    of the value map (``lepe``) is added to the attention output.

    Args:
        dim: number of input (and output) channels.
        n_win: number of windows in one side (so the actual number of windows is n_win*n_win).
        num_heads: number of attention heads; must divide both ``dim`` and ``qk_dim``.
        qk_dim: channel width of query/key; defaults to ``dim`` when falsy.
        qk_scale: scale applied to the query; defaults to ``qk_dim ** -0.5`` when falsy.
        kv_per_win: for kv_downsample_mode='ada_xxxpool' only, number of key/values per window.
            Similar to n_win, the actual number is kv_per_win*kv_per_win.
        kv_downsample_ratio: pooling size for the 'maxpool' / 'avgpool' modes.
        kv_downsample_kernel: stored on the instance but not used by this class.
        kv_downsample_mode: 'identity', 'ada_avgpool', 'ada_maxpool', 'maxpool' or 'avgpool'.
            'fracpool' and 'conv' raise ``NotImplementedError``.
        topk: topk for window filtering.
        param_attention: 'qkvo' - linear for q,k,v and o; 'qkv' - linear for q,k,v only.
        param_routing: extra linear for routing (requires ``diff_routing``).
        diff_routing: whether to set routing differentiable.
        soft_routing: whether to multiply soft routing weights.
        side_dwconv: kernel size of the depth-wise ``lepe`` convolution; a value <= 0
            replaces it with a function returning zeros.
        auto_pad: pad H and W up to a multiple of ``n_win`` and crop the output back.

    Raises:
        ValueError: for an unsupported ``param_attention`` or ``kv_downsample_mode``.
        NotImplementedError: for the 'fracpool' and 'conv' downsample modes.
    """

    def __init__(self, dim, n_win=7, num_heads=8, qk_dim=None, qk_scale=None,
                 kv_per_win=4, kv_downsample_ratio=4, kv_downsample_kernel=None, kv_downsample_mode='identity',
                 topk=4, param_attention="qkvo", param_routing=False, diff_routing=False, soft_routing=False,
                 side_dwconv=3,
                 auto_pad=True):
        super().__init__()
        self.dim = dim
        self.n_win = n_win  # Wh, Ww
        self.num_heads = num_heads
        self.qk_dim = qk_dim or dim
        assert self.qk_dim % num_heads == 0 and self.dim % num_heads == 0, 'qk_dim and dim must be divisible by num_heads!'
        self.scale = qk_scale or self.qk_dim ** -0.5
        # Depth-wise conv applied to the value map and added to the attention output.
        self.lepe = nn.Conv2d(dim, dim, kernel_size=side_dwconv, stride=1, padding=side_dwconv // 2,
                              groups=dim) if side_dwconv > 0 else \
            lambda x: torch.zeros_like(x)
        self.topk = topk
        self.param_routing = param_routing
        self.diff_routing = diff_routing
        self.soft_routing = soft_routing
        # router
        assert not (self.param_routing and not self.diff_routing)
        self.router = TopkRouting(qk_dim=self.qk_dim,
                                  qk_scale=self.scale,
                                  topk=self.topk,
                                  diff_routing=self.diff_routing,
                                  param_routing=self.param_routing)
        if self.soft_routing:  # soft routing, always differentiable (if no detach)
            mul_weight = 'soft'
        elif self.diff_routing:  # hard differentiable routing
            mul_weight = 'hard'
        else:  # hard non-differentiable routing
            mul_weight = 'none'
        self.kv_gather = KVGather(mul_weight=mul_weight)
        # qkv mapping (shared by both global routing and local attention)
        self.param_attention = param_attention
        if self.param_attention == 'qkvo':
            self.qkv = QKVLinear(self.dim, self.qk_dim)
            self.wo = nn.Linear(dim, dim)
        elif self.param_attention == 'qkv':
            self.qkv = QKVLinear(self.dim, self.qk_dim)
            self.wo = nn.Identity()
        else:
            raise ValueError(f'param_attention mode {self.param_attention} is not surpported!')
        self.kv_downsample_mode = kv_downsample_mode
        self.kv_per_win = kv_per_win
        self.kv_downsample_ratio = kv_downsample_ratio
        self.kv_downsample_kernel = kv_downsample_kernel
        # Misspelled attribute name kept so existing readers of it keep working.
        self.kv_downsample_kenel = kv_downsample_kernel
        if self.kv_downsample_mode == 'ada_avgpool':
            assert self.kv_per_win is not None
            self.kv_down = nn.AdaptiveAvgPool2d(self.kv_per_win)
        elif self.kv_downsample_mode == 'ada_maxpool':
            assert self.kv_per_win is not None
            self.kv_down = nn.AdaptiveMaxPool2d(self.kv_per_win)
        elif self.kv_downsample_mode == 'maxpool':
            assert self.kv_downsample_ratio is not None
            self.kv_down = nn.MaxPool2d(self.kv_downsample_ratio) if self.kv_downsample_ratio > 1 else nn.Identity()
        elif self.kv_downsample_mode == 'avgpool':
            assert self.kv_downsample_ratio is not None
            self.kv_down = nn.AvgPool2d(self.kv_downsample_ratio) if self.kv_downsample_ratio > 1 else nn.Identity()
        elif self.kv_downsample_mode == 'identity':  # no kv downsampling
            self.kv_down = nn.Identity()
        elif self.kv_downsample_mode == 'fracpool':
            raise NotImplementedError('fracpool policy is not implemented yet!')
        elif kv_downsample_mode == 'conv':
            raise NotImplementedError('conv policy is not implemented yet!')
        else:
            # Fixed: the attribute name was misspelled here, which raised
            # AttributeError instead of the intended ValueError.
            raise ValueError(f'kv_down_sample_mode {self.kv_downsample_mode} is not surpported!')
        self.attn_act = nn.Softmax(dim=-1)
        self.auto_pad = auto_pad

    def forward(self, x, ret_attn_mask=False):
        """Apply bi-level routing attention.

        Args:
            x: NCHW tensor (it is rearranged to NHWC internally).
            ret_attn_mask: if True, also return the routing weights, routing
                indices and attention weights.

        Return:
            NCHW tensor when ``ret_attn_mask`` is False. When it is True, the
            tuple ``(out, r_weight, r_idx, attn_weight)`` is returned and
            ``out`` is left in NHWC layout (it is not rearranged back).
        """
        x = rearrange(x, "n c h w -> n h w c")
        if self.auto_pad:
            N, H_in, W_in, C = x.size()
            pad_l = pad_t = 0
            # Pad only on the right/bottom up to the next multiple of n_win.
            pad_r = (self.n_win - W_in % self.n_win) % self.n_win
            pad_b = (self.n_win - H_in % self.n_win) % self.n_win
            x = F.pad(x, (0, 0,  # dim=-1
                          pad_l, pad_r,  # dim=-2
                          pad_t, pad_b))  # dim=-3
            _, H, W, _ = x.size()  # padded size
        else:
            N, H, W, C = x.size()
            assert H % self.n_win == 0 and W % self.n_win == 0  #
        # Partition into n_win * n_win windows.
        x = rearrange(x, "n (j h) (i w) c -> n (j i) h w c", j=self.n_win, i=self.n_win)
        q, kv = self.qkv(x)
        q_pix = rearrange(q, 'n p2 h w c -> n p2 (h w) c')
        kv_pix = self.kv_down(rearrange(kv, 'n p2 h w c -> (n p2) c h w'))
        kv_pix = rearrange(kv_pix, '(n j i) c h w -> n (j i) (h w) c', j=self.n_win, i=self.n_win)
        q_win, k_win = q.mean([2, 3]), kv[..., 0:self.qk_dim].mean(
            [2, 3])  # window-wise qk, (n, p^2, c_qk), (n, p^2, c_qk)
        # lepe is computed on the value part of kv, restored to a full feature map.
        lepe = self.lepe(rearrange(kv[..., self.qk_dim:], 'n (j i) h w c -> n c (j h) (i w)', j=self.n_win,
                                   i=self.n_win).contiguous())
        lepe = rearrange(lepe, 'n c (j h) (i w) -> n (j h) (i w) c', j=self.n_win, i=self.n_win)
        r_weight, r_idx = self.router(q_win, k_win)  # both are (n, p^2, topk) tensors
        kv_pix_sel = self.kv_gather(r_idx=r_idx, r_weight=r_weight, kv=kv_pix)  # (n, p^2, topk, h_kv*w_kv, c_qk+c_v)
        k_pix_sel, v_pix_sel = kv_pix_sel.split([self.qk_dim, self.dim], dim=-1)
        # Keys are laid out already transposed: (n*p^2, m, c_qk//m, topk*h_kv*w_kv).
        k_pix_sel = rearrange(k_pix_sel, 'n p2 k w2 (m c) -> (n p2) m c (k w2)',
                              m=self.num_heads)
        v_pix_sel = rearrange(v_pix_sel, 'n p2 k w2 (m c) -> (n p2) m (k w2) c',
                              m=self.num_heads)  # flatten to BMLC, (n*p^2, m, topk*h_kv*w_kv, c_v//m)
        q_pix = rearrange(q_pix, 'n p2 w2 (m c) -> (n p2) m w2 c',
                          m=self.num_heads)  # to BMLC tensor (n*p^2, m, w^2, c_qk//m)
        # (n*p^2, m, w^2, c) @ (n*p^2, m, c, topk*h_kv*w_kv) -> (n*p^2, m, w^2, topk*h_kv*w_kv)
        attn_weight = (q_pix * self.scale) @ k_pix_sel
        attn_weight = self.attn_act(attn_weight)
        # (n*p^2, m, w^2, topk*h_kv*w_kv) @ (n*p^2, m, topk*h_kv*w_kv, c) -> (n*p^2, m, w^2, c)
        out = attn_weight @ v_pix_sel
        out = rearrange(out, '(n j i) m (h w) c -> n (j h) (i w) (m c)', j=self.n_win, i=self.n_win,
                        h=H // self.n_win, w=W // self.n_win)
        out = out + lepe
        out = self.wo(out)
        if self.auto_pad and (pad_r > 0 or pad_b > 0):
            # Crop the padding added above.
            out = out[:, :H_in, :W_in, :].contiguous()
        if ret_attn_mask:
            return out, r_weight, r_idx, attn_weight
        else:
            return rearrange(out, "n h w c -> n c h w")
之前没有安装 einops 包的,需要先安装(例如执行 pip install einops),否则上面的 from einops import rearrange 会报错。
如下图所示:
第②步:修改yolo.py文件
再来修改yolo.py,在parse_model函数中找到 elif m is Concat: 语句,在其后面加上下面代码:
- elif m in [BiLevelRoutingAttention]:
- c2 = ch[f]
- args = [c2, *args[0:]]
如下图所示:
第③步:创建自定义的yaml文件
yaml文件配置完整代码如下:
# YOLOv5 🚀 by Ultralytics, GPL-3.0 license

# Parameters
nc: 80  # number of classes
depth_multiple: 0.33  # model depth multiple
width_multiple: 0.50  # layer channel multiple
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32

# YOLOv5 v6.0 backbone
backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 6, 2, 2]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C3, [128]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 6, C3, [256]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 9, C3, [512]],
   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
   [-1, 3, C3, [1024]],
   [-1, 1, SPPF, [1024, 5]],  # 9
  ]

# YOLOv5 v6.0 head
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C3, [512, False]],  # 13

   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C3, [256, False]],  # 17 (P3/8-small)

   [-1, 1, BiLevelRoutingAttention, []],  # 18 (inserted layer; later indices shift by one)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]],  # cat head P4
   [-1, 3, C3, [512, False]],  # 21 (P4/16-medium)

   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]],  # cat head P5
   [-1, 3, C3, [1024, False]],  # 24 (P5/32-large)

   # NOTE(review): 17 is the C3 output *before* the attention layer, so the P3
   # detection branch does not see the attention output; use 18 if that is intended.
   [[17, 21, 24], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]
第④步:验证是否加入成功
运行yolo.py
这样就OK啦!
PS:
(1)这个注意力机制在我同学的数据集上高效涨点,我加入后试了不同的位置,在小目标层相比原始yolov5s涨了0.2,其余位置都掉点了。可以证明的确是对小目标检测更加友好。
引用原文一句话:
“由于 BiFormer 以查询自适应的方式关注一小部分相关标记,而不会分散其他不相关标记的注意力,因此它具有良好的性能和高计算效率。”
(2)通过不止一个人的实验发现,加入这个模块后没有增加太多参数,但电脑就莫名其妙跑不动了。在云端跑还是OK的。