# random colab error: "A UTF-8 locale is required. Got ANSI_X3.4-1968"
# https://github.com/googlecolab/colabtools/issues/3409
import locale
locale.getpreferredencoding = lambda: "UTF-8"
Follow-up to the PyTorch quantization post … can we make the model faster with a GPU and TensorRT?

First, get a FasterRCNN with a ResNet-101 backbone, same as in the previous post …
Code
%%capture
import torch
from torchvision.models.resnet import ResNet, Bottleneck, ResNet101_Weights
from torchvision.models._utils import IntermediateLayerGetter
from torchvision.models.detection.backbone_utils import BackboneWithFPN
from torchvision.models.detection.faster_rcnn import FasterRCNN
def resnet_101():
    resnet = ResNet(block=Bottleneck, layers=[3, 4, 23, 3])
    resnet.load_state_dict(ResNet101_Weights.DEFAULT.get_state_dict(progress=True))
    return resnet

resnet = resnet_101()
# same as before, get intermediate layers and their output dimensions
returned_layers = [1, 2, 3, 4]
return_layers = {f"layer{k}": str(v) for v, k in enumerate(returned_layers)}
in_channels_list = []
for k1, m1 in resnet.named_children():
    if 'layer' in k1:
        in_channels_list.append(m1[-1].bn3.num_features)
rcnn = FasterRCNN(
    BackboneWithFPN(
        backbone=resnet,
        return_layers=return_layers,
        in_channels_list=in_channels_list,
        out_channels=256,
        extra_blocks=None,
        norm_layer=None,
    ),
    num_classes=2
)
rcnn.eval()
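As a quick sanity check (my addition, not in the original post), the FPN-wrapped backbone should emit one feature map per returned layer plus the extra pooled level, and the detector in eval mode should return a list of detection dicts:

# illustrative sanity check: run a dummy image through the assembled model
with torch.no_grad():
    feats = rcnn.backbone(torch.rand(1, 3, 200, 200))
    print({k: v.shape for k, v in feats.items()})  # keys '0'..'3' plus 'pool'
    dets = rcnn([torch.rand(3, 200, 200)])
    print(dets[0].keys())  # dict_keys(['boxes', 'labels', 'scores'])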
Time the RCNN on both CPU and GPU. I don't recall what the specs were the last time I used Colab to profile inference time, so I'll document that here as well. I'm using an L4 GPU and the following CPU:
# !cat /proc/cpuinfo | grep 'name' | uniq
!lscpu | grep 'name'
Model name: Intel(R) Xeon(R) CPU @ 2.20GHz
!nvidia-smi -L
GPU 0: NVIDIA L4 (UUID: GPU-393b8fe1-1ca8-7aaf-94b9-04eef8e2fda5)
# random image
image = torch.rand(3, 200, 200)

# put on CPU
rcnn.to(torch.device('cpu'))
image_cpu = image.to(torch.device('cpu'))

with torch.no_grad():
    cpu_time = %timeit -o rcnn([image_cpu])
1.47 s ± 137 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
from copy import deepcopy
# on GPU
rcnn_gpu = deepcopy(rcnn).to(torch.device('cuda'))  # rcnn.to(torch.device('cuda'))
image_gpu = image.to(torch.device('cuda'))

with torch.no_grad():
    gpu_time = %timeit -o rcnn_gpu([image_gpu])
37.9 ms ± 235 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
we can also test with half precision…
rcnn_gpu_half = rcnn_gpu.half().to(torch.device('cuda'))
input_half = image_gpu.half()

with torch.no_grad():
    gpu_half_time = %timeit -o rcnn_gpu_half([input_half])
29.1 ms ± 1.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Also re-clock the quantized model (FX Graph Mode quantization), since its performance is CPU-specific as well.
Code
%%capture
from torch.ao.quantization import quantize_fx
from torch.ao.quantization.qconfig_mapping import get_default_qconfig_mapping
quant_rcnn = deepcopy(rcnn)

qconfig_mapping = get_default_qconfig_mapping("fbgemm")  # "qnnpack"

# assume calibrated already
quant_rcnn.eval()
quant_rcnn.to(torch.device('cpu'))

# prepare and quantize
example_input = torch.randn(1, 3, 200, 200)
quant_rcnn.backbone = quantize_fx.prepare_fx(quant_rcnn.backbone, qconfig_mapping, example_input)
quant_rcnn.backbone = quantize_fx.convert_fx(quant_rcnn.backbone)

script_module = torch.jit.script(quant_rcnn)
script_module.save("./quant_rcnn.pt")
quant_rcnn_jit = torch.jit.load("./quant_rcnn.pt", map_location=torch.device('cpu'))
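The cell above assumes calibration already happened (per the comment). For reference, a minimal post-training calibration pass would sit between prepare_fx and convert_fx and look roughly like the sketch below, where calibration_images is a hypothetical iterable of representative inputs (my naming, not from this post):

# illustrative only: feed representative data through the observer-instrumented
# backbone so convert_fx has activation statistics for the quantization params
prepared = quantize_fx.prepare_fx(deepcopy(rcnn).backbone, qconfig_mapping, example_input)
with torch.no_grad():
    for img in calibration_images:  # hypothetical iterable of (1, 3, H, W) tensors
        prepared(img)
quantized_backbone = quantize_fx.convert_fx(prepared)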
#| code-fold: true
import warnings
# warmup
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for _ in range(3):
        __ = quant_rcnn_jit([image_cpu])

with torch.no_grad():
    quant_time = %timeit -o quant_rcnn_jit([image_cpu])
652 ms ± 81 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Below I convert the float model to ONNX. I went through ONNX because that used to be the preferred route to TensorRT. However, the ONNX conversion didn't play well with the trtexec command line utility for TensorRT, regardless of which torch-to-ONNX exporter I used. Below, the old TorchScript ONNX exporter is used, but the newer 'dynamo' exporter also had issues. Thankfully PyTorch now has a very easy TensorRT API, but I keep the ONNX model and evaluate it to see if a simple conversion offers any benefit.
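(For reference, the 'dynamo' exporter mentioned above is the torch.onnx.dynamo_export API; the commented sketch below shows roughly how it is invoked, with the exact call varying by PyTorch version.)

# newer dynamo-based exporter, shown for reference only (it also had issues here)
# onnx_program = torch.onnx.dynamo_export(deepcopy(rcnn), [torch.randn(3, 200, 200)])
# onnx_program.save("rcnn_dynamo.onnx")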
%%capture
!pip install onnx
!pip install onnxruntime
import onnx
torch.onnx.export(
    deepcopy(rcnn),
    # onnx wants a tuple of 2 or bombs: https://github.com/zhiqwang/yolort/issues/485
    ([torch.randn(3, 200, 200)], ),
    "rcnn.onnx",
    # do_constant_folding=True,
    opset_version=11,
    verbose=False
)

# make sure the onnx proto is valid
rcnn_onnx = onnx.load("rcnn.onnx")
onnx.checker.check_model(rcnn_onnx)
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py:4009: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
(torch.floor((input.size(i + 2).float() * torch.tensor(scale_factors[i], dtype=torch.float32)).float()))
/usr/local/lib/python3.10/dist-packages/torchvision/ops/boxes.py:166: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
/usr/local/lib/python3.10/dist-packages/torchvision/ops/boxes.py:168: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
/usr/local/lib/python3.10/dist-packages/torch/__init__.py:1559: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
assert condition, message
/usr/local/lib/python3.10/dist-packages/torchvision/models/detection/transform.py:308: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
torch.tensor(s, dtype=torch.float32, device=boxes.device)
/usr/local/lib/python3.10/dist-packages/torchvision/models/detection/transform.py:309: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
/ torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)
/usr/local/lib/python3.10/dist-packages/torch/onnx/symbolic_opset9.py:5858: UserWarning: Exporting aten::index operator of advanced indexing in opset 11 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.
warnings.warn(
Run inference on the ONNX model, make sure the outputs are as expected, then clock it…
import onnxruntime
import numpy as np
ort_session = onnxruntime.InferenceSession("rcnn.onnx", providers=["CPUExecutionProvider"])
# good to make sure inputs are as expected: [i.name for i in ort_session.get_inputs()]

# onnx wants a numpy array, not a torch tensor
def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

# get a prediction. onnx doesn't need a list input like the torch model does
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(image)}
ort_outs = ort_session.run(None, ort_inputs)
# onnx outputs are a list of three arrays corresponding to 'boxes', 'labels', and 'scores'
print("onnx out shapes: ", [arr.shape for arr in ort_outs])

# quant model out is a tuple of (losses, outputs)
torch_outs = __[1][0]
print("torch out shapes: ", [torch_outs[k].shape for k in torch_outs])
onnx out shapes: [(100, 4), (100,), (100,)]
torch out shapes: [torch.Size([100, 4]), torch.Size([100]), torch.Size([100])]
onnx_time = %timeit -o ort_session.run(None, ort_inputs)
# sess = onnxruntime.InferenceSession('rcnn.onnx', providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider'])
# onnx_trt_time = %timeit -o sess.run(None, ort_inputs)
1.05 s ± 114 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# more steps for using trtexec which has issues with rcnn input shape
# !sudo apt-get install tensorrt
# !pip install tensorrt
# !ls /usr/src/tensorrt/bin # make sure trtexec is there
# !/usr/src/tensorrt/bin/trtexec --onnx=rcnn.onnx --saveEngine=rcnn_engine_pytorch.trt
Use the handy torch-tensorrt package…
%%capture
!python -m pip install torch-tensorrt
%%capture
= torch.device("cuda")
device rcnn.to(device)
import torch_tensorrt
# need to wrap rcnn inputs in list
= [[torch.randn(3, 200, 200).to("cuda")]] # .half()]
inputs
= torch_tensorrt.compile(
trt_model
deepcopy(rcnn),="torch_compile",
ir# frontend api below complains about input shape
# backend="torch_tensorrt",
=inputs,
inputs={torch.float32}, # {torch.half}
enabled_precisions=True,
debug=20 << 30,
workspace_size=7,
min_block_size={},
torch_executed_ops )
%%capture
# contrary to docs, first run actually compiles model
# https://pytorch.org/TensorRT/tutorials/_rendered_examples/dynamo/torch_compile_resnet_example.html#torch-compile-resnet
outputs = trt_model(*inputs)

trt_time = %timeit -o trt_model(*inputs)
26.1 ms ± 207 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
import matplotlib.pyplot as plt
fig, ax = plt.subplots()

runtime = [
    'cpu',
    'quant',
    'onnx',
    'gpu',
    'gpu half',
    'tensorrt'
]
latency = [
    cpu_time.average,
    quant_time.average,
    onnx_time.average,
    gpu_time.average,
    gpu_half_time.average,
    trt_time.average
]
latency = [round(n, 3) for n in latency]  # timeit averages are in seconds

ax.bar(runtime, latency)
ax.set_ylabel('latency (s)')
ax.set_yscale('log')
plt.show()
… half precision on the GPU is nearly as fast as TensorRT. With TensorRT we can also use half precision to improve latency even more …
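For example, following the {torch.half} and .half() hints left in the compile cell above, a half-precision build would be a small change to the same call. This is an untested sketch; exact behavior depends on the torch-tensorrt version:

# sketch: same torch_tensorrt.compile call as above, but allowing fp16 TensorRT kernels
trt_model_half = torch_tensorrt.compile(
    deepcopy(rcnn),
    ir="torch_compile",
    inputs=inputs,
    enabled_precisions={torch.half},  # let TensorRT choose fp16 kernels
    workspace_size=20 << 30,
    min_block_size=7,
    torch_executed_ops={},
)

_ = trt_model_half(*inputs)  # first call triggers compilation
trt_half_time = %timeit -o trt_model_half(*inputs)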