1. 前言
为了保证模型输出的精度,地平线的实例分割参考算法deeplabv3+在部署时,将模型尾部的resize和argmax放在了CPU的后处理代码中进行计算,但是这样也带来了一些问题,比如会占用比较大的CPU资源,尤其是大尺寸特征图的argmax计算。事实上,地平线J5的BPU也是支持resize和argmax算子的,本帖将介绍将resize和argmax计算融入到deeplabv3+模型中进行BPU加速的教程。
2. 代码修改流程
2.1 环境部署
首先访问地平线开发者社区的OpenExplorer算法工具链版本发布帖,获取GPU docker和OE开发包,然后参考用户手册环境部署章节完成环境的部署。
2.2 代码修改流程
Step1:
在Docker环境下,打开/usr/local/lib/python3.8/dist-packages/hat/models/task_modules/deeplab/head.py文件,修改Deeplabv3plusHead类的代码,如下所示:
# NOTE(review): snippet from Deeplabv3plusHead.__init__ in
# hat/models/task_modules/deeplab/head.py; leading indentation was lost
# when this tutorial was extracted -- the body lines below belong inside
# the class / __init__ body. `...` marks code omitted by the tutorial.
def __init__(
self,
in_channels: int,
c1_index: int,
c1_in_channels: int,
feat_channels: int,
num_classes: int,
dilations: List[int],
num_repeats: List[int],
argmax_output: Optional[bool] = False,
dequant_output: Optional[bool] = True,
int8_output: Optional[bool] = True,
bn_kwargs: Optional[Dict] = None,
dropout_ratio: Optional[float] = 0.1,
upsample_output_scale: Optional[int] = None,
upsample_decode_scale: Optional[int] = 4,
bias=True,
):
...
# Define the resize2 operator:
# upsample the Deeplabv3plusHead output by 8x (e.g. to 1024x2048)
self.resize2 = hnn.Interpolate(
# resize back to the original image size
scale_factor=8,
align_corners=None,
recompute_scale_factor=True,
)
# NOTE(review): Deeplabv3plusHead.forward with the added argmax branch;
# indentation was lost in extraction. Returns (argmax map, dequantized logits).
def forward(self, inputs):
x = inputs[-1]
c1_input = inputs[self.c1_index]
x = self.aspp(x)
x = self.seg_convs([c1_input, x])
if self.dropout is not None:
x = self.dropout(x)
seg_pred1 = self.cls_seg(x)
# Added argmax output branch: upsample the logits with resize2, then
# take the per-pixel class index (keepdim=True keeps a channel dim of 1)
seg_pred = self.resize2(seg_pred1)
seg_pred = seg_pred.argmax(dim=1,keepdim=True)
seg_pred1 = self.dequant(seg_pred1)
return seg_pred,seg_pred1
Step2:
在Docker环境下,打开/usr/local/lib/python3.8/dist-packages/hat/models/structures/encoder_decoder.py 文件,修改_post_process函数的代码,如下所示:
# NOTE(review): modified _post_process from encoder_decoder.py; the head now
# returns (argmax_pred, dequantized_logits), so training uses preds[1] and
# eval returns preds[0] directly. Indentation was lost in extraction.
@fx_wrap()
def _post_process(self, gts, preds, features):
if self.training:
# Change 1: compute training targets from preds[1]
# (the dequantized logits branch of the new two-output head)
target = self.target(gts, preds[1])
decode_loss = self.loss(**target)
aux_losses = OrderedDict()
if self.auxiliary_heads is not None:
features = [
i.as_subclass(torch.Tensor)
if isinstance(i, QTensor)
else i
for i in features
]
for head, target_mod, loss in zip(
self.auxiliary_heads,
self.auxiliary_target,
self.auxiliary_loss,
):
preds = head(features)
target = target_mod(gts, preds)
aux_loss = loss(**target)
aux_losses.update(aux_loss)
return {**decode_loss, **aux_losses}
else:
# Change 2: resize and argmax are already done inside
# self.decode_head, so the old self.decode step is skipped
# if self.decode is not None:
# preds = self.decode(preds)
# keep only the argmax branch output
preds=preds[0]
if self.with_target is False:
return preds
# NOTE(review): `target` is never assigned in this eval branch, so
# reaching this line (with_target=True) would raise NameError --
# confirm intent; it likely should return `gts` here.
return preds, target
def forward(self, data: dict):
    """Run backbone -> (optional) neck -> decode head, then post-process.

    Args:
        data: dict holding "img" (the input image) and, optionally,
            "gt_seg" (segmentation ground truth used during training).

    Returns:
        The result of ``self._post_process`` -- the loss dict in training
        mode, or the prediction (and target, when configured) in eval mode.
    """
    image = data["img"]
    gts = data.get("gt_seg", None)
    features = self.backbone(image)
    if self.neck is not None:
        features = self.neck(features)
    # decode_head returns (argmax_pred, dequantized_logits)
    preds = self.decode_head(features)
    # removed the leftover debug print of prediction shapes -- it ran on
    # every forward pass and would spam logs in deployment
    return self._post_process(gts, preds, features)
Step3:
进入到OE开发包的ddk/samples/ai_toolchain/horizon_model_train_sample/scripts/configs/segmentation目录下,
修改config文件中的model和deploy_model字段,如下所示:
# NOTE(review): training-model config snippet; `...` marks fields omitted by
# the tutorial, so this is not runnable verbatim. Indentation lost in extraction.
model = dict(
type="EncoderDecoder",
backbone=dict(
type="efficientnet",
bn_kwargs=bn_kwargs,
model_type="b0",
num_classes=1000,
include_top=False,
activation="relu",
use_se_block=False,
blocks_args=SEG_BLOCKS_ARGS,
),
decode_head=dict(
type="Deeplabv3plusHead",
in_channels=320,
feat_channels=128,
num_classes=19,
c1_index=2,
dilations=[1, 2, 4, 4],
num_repeats=[1, 1, 1, 2],
c1_in_channels=40,
bn_kwargs=bn_kwargs,
argmax_output=False,
dequant_output=True,
# set the conv output to int8:
# the deploy model tail changed from conv to resize+argmax, so the
# conv no longer supports high-precision int32 output
int8_output=True,
dropout_ratio=0.1,
upsample_decode_scale=4,
upsample_output_scale=None,
),
...
)
# NOTE(review): deploy-model config snippet; mirrors `model` above except it
# has no extra fields. Indentation was lost in extraction.
deploy_model = dict(
type="EncoderDecoder",
backbone=dict(
type="efficientnet",
bn_kwargs=bn_kwargs,
model_type="b0",
num_classes=1000,
include_top=False,
activation="relu",
use_se_block=False,
blocks_args=SEG_BLOCKS_ARGS,
),
decode_head=dict(
type="Deeplabv3plusHead",
in_channels=320,
feat_channels=128,
num_classes=19,
c1_index=2,
dilations=[1, 2, 4, 4],
num_repeats=[1, 1, 1, 2],
c1_in_channels=40,
bn_kwargs=bn_kwargs,
argmax_output=False,
dequant_output=True,
# change the conv output to int8
int8_output=True,
dropout_ratio=0.1,
upsample_decode_scale=4,
upsample_output_scale=None,
),
)
Step4:
修改config文件中的update_val_metric函数:
def update_val_metric(metrics, batch, model_outs):
    """Feed one batch of model outputs and ground truth into every metric.

    After moving resize & argmax into the model, its output has shape
    1x1x1024x2048 while the ground truth "gt_seg" is 1x1024x2048, so a
    channel dimension is inserted on the target before each update.
    """
    for metric in metrics:
        metric.update(batch["gt_seg"].unsqueeze(1), model_outs)
至此,代码修改完成。
然后,我们按照地平线的算法工具链用户手册完成deeplabv3+参考算法的训练和编译后,生成的板端部署hbm模型中的resize和argmax算子就可以实现在BPU上的加速了。