MobileViT Network Test
MobileViT is a lightweight vision model that combines CNNs and Transformers. It can run in real time on device and is applicable to classification, detection, and segmentation. Its core idea is to pair a CNN's strength at extracting local features with a Transformer's ability to model long-range dependencies. On ImageNet-1k it performs strongly: across its XXS, XS, and S sizes it offers better accuracy per parameter than MobileNet- and ViT-family models. For example, MobileNetV2 with 3.5M parameters reaches 73.3% Top-1 on ImageNet-1k, while MobileViT-XS reaches 74.8% with only 2.3M parameters. MobileViT also does well on detection and segmentation; on COCO object detection it is about 5.7 points more accurate than MobileNetv3 at a similar parameter count. Its inference latency is still higher than that of comparable lightweight CNNs, but its small size and accuracy make it an attractive choice for mobile deployment.
mobilevit-pytorch/mobilevit/mobilevit.py at master · chinhsuanwu/mobilevit-pytorch · GitHub
MobileFormer is a different design from the one above:
https://github.com/slwang9353/MobileFormer
MobileViT: a lightweight, real-time Transformer-style model
This paper is the first to report an on-device, real-time CNN+Transformer hybrid, and it is not limited to classification: detection and segmentation are covered as well, making it one of the better recent works. Its inference speed, however, still trails traditional CNNs: MobileNetV2 with 3.5M parameters runs in 0.92 ms at 73.3% accuracy, while MobileViT with 2.3M parameters takes 7.28 ms for a slightly higher 74.8%. Unfortunately, the official MobileViT source code had not been released, so its accuracy and speed could not be tried firsthand.
Experiments show that MobileViT clearly outperforms CNN-based and ViT-based networks across a range of tasks and datasets.
On the ImageNet-1k dataset, MobileViT reaches 78.4% Top-1 accuracy with roughly 6 million parameters, about 3.2% higher than MobileNetv3 and 6.2% higher than DeiT at a comparable parameter count.
With pretrained weights:
GitHub - wilile26811249/MobileViT: An unofficial PyTorch implementation of MobileViT, developed based on the paper "MobileViT: Lightweight, versatile, and mobile-friendly Vision Transformer".
Model file is 43 MB:
| Model | Dataset | Learning rate | LR schedule | Optimizer | Weight decay | Top-1 Acc. | Top-5 Acc. |
|---|---|---|---|---|---|---|---|
| MobileViT | ImageNet-1k | 0.05 | Cosine LR | SGDM | 1e-5 | 61.918% | 83.05% |
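A minimal sketch of how a checkpoint like that could be loaded into the MobileViT implementation below; the filename mobilevit_s_imagenet.pth is hypothetical, and this assumes the file stores a plain state_dict whose keys match this code:

import torch

model = MobileViT_S(img_size=256, num_classes=1000)                      # constructor defined later in this post
state_dict = torch.load("mobilevit_s_imagenet.pth", map_location="cpu")  # hypothetical local checkpoint path
model.load_state_dict(state_dict)
model.eval()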
A community implementation:
https://github.com/Xingyu-Romantic/MobileViT_Openpose
On MS-COCO object detection, MobileViT is about 5.7 percentage points more accurate than MobileNetv3 at a similar parameter count.
Model file sizes:
MobileViT-S: 19 MB
MobileViT-XS: 9.21 MB
MobileViT-XXS: 5.19 MB
Apple releases MobileViT: a new vision Transformer that is both lightweight and mobile-friendly
import torch
import torch.nn as nn
from typing import Callable, Any, Optional, List
from einops import rearrange
class ConvNormAct(nn.Module):
def __init__(self,
in_channels: int,
out_channels: int,
kernel_size: int = 3,
stride = 1,
padding: Optional[int] = None,
groups: int = 1,
norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d,
activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.SiLU,
dilation: int = 1
):
super(ConvNormAct, self).__init__()
if padding is None:
padding = (kernel_size - 1) // 2 * dilation
self.conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding,
dilation = dilation, groups = groups, bias = norm_layer is None)
        self.norm_layer = norm_layer(out_channels) if norm_layer is not None else None
        self.act = activation_layer() if activation_layer is not None else None
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.conv(x)
if self.norm_layer is not None:
x = self.norm_layer(x)
if self.act is not None:
x = self.act(x)
return x
class PreNorm(nn.Module):
def __init__(self, dim, fn):
super().__init__()
self.norm = nn.LayerNorm(dim)
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(self.norm(x), **kwargs)
class FFN(nn.Module):
def __init__(self, dim, hidden_dim, dropout=0.):
super().__init__()
self.net = nn.Sequential(
nn.Linear(dim, hidden_dim),
nn.SiLU(),
nn.Dropout(dropout),
nn.Linear(hidden_dim, dim),
nn.Dropout(dropout)
)
def forward(self, x):
return self.net(x)
class MultiHeadSelfAttention(nn.Module):
"""
    Multi-head self-attention layer, implemented with einops-style tensor rearrangement.
Paper: https://arxiv.org/abs/1706.03762
Blog: https://towardsdatascience.com/illustrated-self-attention-2d627e33b20a
Parameters
----------
dim:
Token's dimension, EX: word embedding vector size
num_heads:
The number of distinct representations to learn
dim_head:
        The dimension of each head
"""
def __init__(self, dim, num_heads = 8, dim_head = None):
super(MultiHeadSelfAttention, self).__init__()
self.num_heads = num_heads
self.dim_head = int(dim / num_heads) if dim_head is None else dim_head
_weight_dim = self.num_heads * self.dim_head
self.to_qvk = nn.Linear(dim, _weight_dim * 3, bias = False)
        self.scale_factor = dim ** -0.5
# Weight matrix for output, Size: num_heads*dim_head X dim
# Final linear transformation layer
self.w_out = nn.Linear(_weight_dim, dim, bias = False)
def forward(self, x):
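        # x comes from MobileVitBlock as an unfolded feature map of shape
        # (batch, patch_area, num_patches, dim); attention is computed along the
        # num_patches axis independently for each pixel position within a patch.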
qkv = self.to_qvk(x).chunk(3, dim = -1)
q, k, v = map(lambda t: rearrange(t, 'b p n (h d) -> b p h n d', h = self.num_heads), qkv)
dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale_factor
attn = torch.softmax(dots, dim = -1)
out = torch.matmul(attn, v)
out = rearrange(out, 'b p h n d -> b p n (h d)')
return self.w_out(out)
class Transformer(nn.Module):
def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.1):
super().__init__()
self.layers = nn.ModuleList([])
for _ in range(depth):
self.layers.append(nn.ModuleList([
PreNorm(dim, MultiHeadSelfAttention(dim, heads, dim_head)),
PreNorm(dim, FFN(dim, mlp_dim, dropout))
]))
def forward(self, x):
for attn, ff in self.layers:
x = attn(x) + x
x = ff(x) + x
return x
class InvertedResidual(nn.Module):
"""
MobileNetv2 InvertedResidual block
"""
def __init__(self, in_channels, out_channels, stride = 1, expand_ratio = 2, act_layer = nn.SiLU):
super(InvertedResidual, self).__init__()
self.stride = stride
self.use_res_connect = self.stride == 1 and in_channels == out_channels
hidden_dim = int(round(in_channels * expand_ratio))
layers = []
if expand_ratio != 1:
layers.append(ConvNormAct(in_channels, hidden_dim, kernel_size = 1, activation_layer = None))
# Depth-wise convolution
layers.append(
ConvNormAct(hidden_dim, hidden_dim, kernel_size = 3, stride = stride,
padding = 1, groups = hidden_dim, activation_layer = act_layer)
)
# Point-wise convolution
layers.append(
nn.Conv2d(hidden_dim, out_channels, kernel_size = 1, stride = 1, bias = False)
)
layers.append(nn.BatchNorm2d(out_channels))
self.conv = nn.Sequential(*layers)
def forward(self, x):
if self.use_res_connect:
return x + self.conv(x)
else:
return self.conv(x)
class MobileVitBlock(nn.Module):
def __init__(self, in_channels, out_channels, d_model, layers, mlp_dim):
super(MobileVitBlock, self).__init__()
# Local representation
self.local_representation = nn.Sequential(
# Encode local spatial information
ConvNormAct(in_channels, in_channels, 3),
            # Project the tensor to a higher-dimensional space
ConvNormAct(in_channels, d_model, 1)
)
self.transformer = Transformer(d_model, layers, 1, 32, mlp_dim, 0.1)
# Fusion block
self.fusion_block1 = nn.Conv2d(d_model, in_channels, kernel_size = 1)
self.fusion_block2 = nn.Conv2d(in_channels * 2, out_channels, 3, padding = 1)
def forward(self, x):
local_repr = self.local_representation(x)
        # Global representation: unfold into 2x2 patches, model inter-patch
        # relationships with the Transformer, then fold back to a feature map
_, _, h, w = local_repr.shape
global_repr = rearrange(local_repr, 'b d (h ph) (w pw) -> b (ph pw) (h w) d', ph=2, pw=2)
global_repr = self.transformer(global_repr)
global_repr = rearrange(global_repr, 'b (ph pw) (h w) d -> b d (h ph) (w pw)', h=h//2, w=w//2, ph=2, pw=2)
        # Fuse the local and global features via concatenation
fuse_repr = self.fusion_block1(global_repr)
result = self.fusion_block2(torch.cat([x, fuse_repr], dim = 1))
return result
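# Per-variant hyper-parameters for the XXS / XS / S models:
#   features        - channel widths of the stem, the MV2 stages, and the final 1x1 conv
#   d               - Transformer embedding dims of the three MobileViT blocks
#   expansion_ratio - expansion factor of the MV2 inverted-residual blocks
#   layers          - Transformer depth inside each MobileViT block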
model_cfg = {
"xxs":{
"features": [16, 16, 24, 24, 48, 48, 64, 64, 80, 80, 320],
"d": [64, 80, 96],
"expansion_ratio": 2,
"layers": [2, 4, 3]
},
"xs":{
"features": [16, 32, 48, 48, 64, 64, 80, 80, 96, 96, 384],
"d": [96, 120, 144],
"expansion_ratio": 4,
"layers": [2, 4, 3]
},
"s":{
"features": [16, 32, 64, 64, 96, 96, 128, 128, 160, 160, 640],
"d": [144, 192, 240],
"expansion_ratio": 4,
"layers": [2, 4, 3]
},
}
class MobileViT(nn.Module):
def __init__(self, img_size, features_list, d_list, transformer_depth, expansion, num_classes = 1000):
super(MobileViT, self).__init__()
self.stem = nn.Sequential(
nn.Conv2d(in_channels = 3, out_channels = features_list[0], kernel_size = 3, stride = 2, padding = 1),
InvertedResidual(in_channels = features_list[0], out_channels = features_list[1], stride = 1, expand_ratio = expansion),
)
self.stage1 = nn.Sequential(
InvertedResidual(in_channels = features_list[1], out_channels = features_list[2], stride = 2, expand_ratio = expansion),
InvertedResidual(in_channels = features_list[2], out_channels = features_list[2], stride = 1, expand_ratio = expansion),
InvertedResidual(in_channels = features_list[2], out_channels = features_list[3], stride = 1, expand_ratio = expansion)
)
self.stage2 = nn.Sequential(
InvertedResidual(in_channels = features_list[3], out_channels = features_list[4], stride = 2, expand_ratio = expansion),
MobileVitBlock(in_channels = features_list[4], out_channels = features_list[5], d_model = d_list[0],
layers = transformer_depth[0], mlp_dim = d_list[0] * 2)
)
self.stage3 = nn.Sequential(
InvertedResidual(in_channels = features_list[5], out_channels = features_list[6], stride = 2, expand_ratio = expansion),
MobileVitBlock(in_channels = features_list[6], out_channels = features_list[7], d_model = d_list[1],
layers = transformer_depth[1], mlp_dim = d_list[1] * 4)
)
self.stage4 = nn.Sequential(
InvertedResidual(in_channels = features_list[7], out_channels = features_list[8], stride = 2, expand_ratio = expansion),
MobileVitBlock(in_channels = features_list[8], out_channels = features_list[9], d_model = d_list[2],
layers = transformer_depth[2], mlp_dim = d_list[2] * 4),
nn.Conv2d(in_channels = features_list[9], out_channels = features_list[10], kernel_size = 1, stride = 1, padding = 0)
)
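        # Classification head: global average pooling over the final feature map
        # (img_size // 32 after five stride-2 stages), followed by a linear classifier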
self.avgpool = nn.AvgPool2d(kernel_size = img_size // 32)
self.fc = nn.Linear(features_list[10], num_classes)
def forward(self, x):
# Stem
x = self.stem(x)
# Body
x = self.stage1(x)
x = self.stage2(x)
x = self.stage3(x)
x = self.stage4(x)
# Head
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def MobileViT_XXS(img_size = 256, num_classes = 1000):
cfg_xxs = model_cfg["xxs"]
model_xxs = MobileViT(img_size, cfg_xxs["features"], cfg_xxs["d"], cfg_xxs["layers"], cfg_xxs["expansion_ratio"], num_classes)
return model_xxs
def MobileViT_XS(img_size = 256, num_classes = 1000):
cfg_xs = model_cfg["xs"]
model_xs = MobileViT(img_size, cfg_xs["features"], cfg_xs["d"], cfg_xs["layers"], cfg_xs["expansion_ratio"], num_classes)
return model_xs
def MobileViT_S(img_size = 256, num_classes = 1000):
cfg_s = model_cfg["s"]
model_s = MobileViT(img_size, cfg_s["features"], cfg_s["d"], cfg_s["layers"], cfg_s["expansion_ratio"], num_classes)
return model_s
if __name__ == "__main__":
img = torch.randn(1, 3, 256, 256)
cfg_xxs = model_cfg["xxs"]
model_xxs = MobileViT(256, cfg_xxs["features"], cfg_xxs["d"], cfg_xxs["layers"], cfg_xxs["expansion_ratio"])
cfg_xs = model_cfg["xs"]
model_xs = MobileViT(256, cfg_xs["features"], cfg_xs["d"], cfg_xs["layers"], cfg_xs["expansion_ratio"])
cfg_s = model_cfg["s"]
model_s = MobileViT(256, cfg_s["features"], cfg_s["d"], cfg_s["layers"], cfg_s["expansion_ratio"])
model_s.eval()
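    # Added sanity check: push the random input defined above through MobileViT-S
    # and confirm the head produces one logit per ImageNet-1k class.
    with torch.no_grad():
        out = model_s(img)
    print("output shape:", out.shape)  # expected: torch.Size([1, 1000])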
model_path = "dicenet.pth"
torch.save(model_s.state_dict(), model_path)
import os
import time
fsize = os.path.getsize(model_path)
fsize = fsize / float(1024 * 1024)
print(f"model size {round(fsize, 2)} m")
    # Parameter counts reported in the paper: XXS 1.3M, XS 2.3M, S 5.6M
    print("XXS params (M):", round(sum(p.numel() for p in model_xxs.parameters()) / 1e6, 2))
    print(" XS params (M):", round(sum(p.numel() for p in model_xs.parameters()) / 1e6, 2))
    print("  S params (M):", round(sum(p.numel() for p in model_s.parameters()) / 1e6, 2))
