fix(tensorrt): yolox postprocessing in C++
ONNX post-processing returns wrong results due to a TensorRT bug
Bycob authored and mergify[bot] committed Dec 16, 2021
1 parent bea7cb4 commit 1d781d2
Showing 7 changed files with 168 additions and 37 deletions.
1 change: 1 addition & 0 deletions src/backends/tensorrt/.gitignore
@@ -0,0 +1 @@
+models/
76 changes: 76 additions & 0 deletions src/backends/tensorrt/models/yolo.hpp
@@ -0,0 +1,76 @@
// Copyright (C) 2021 Jolibrain http://www.jolibrain.com

// Author: Louis Jean <[email protected]>

// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 3
// of the License, or (at your option) any later version.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

#include <vector>
#include <algorithm>

namespace dd
{
  namespace yolo_utils
  {
    /** Convert from format:
     * unsorted bbox*4 | objectness | class softmax*n_classes
     * to format:
     * sorted batch id | class_id | class confidence | bbox * 4 */
    static std::vector<float>
    parse_yolo_output(const std::vector<float> &model_out, size_t batch_size,
                      size_t top_k, size_t n_classes, size_t im_width,
                      size_t im_height)
    {
      std::vector<float> vals;
      vals.reserve(batch_size * top_k * 7);
      size_t step = n_classes + 5;
      auto batch_it = model_out.begin();

      for (size_t batch = 0; batch < batch_size; ++batch)
        {
          std::vector<std::vector<float>> result;
          result.reserve(top_k);
          auto end_it = batch_it + top_k * step;

          for (; batch_it != end_it; batch_it += step)
            {
              // get class id & confidence
              auto max_batch_it
                  = std::max_element(batch_it + 5, batch_it + step);
              float cls_pred = std::distance(batch_it + 5, max_batch_it);
              float prob = *max_batch_it * (*(batch_it + 4));

              // convert center, dims to xyxy
              float xc = *batch_it, yc = *(batch_it + 1),
                    w = *(batch_it + 2), h = *(batch_it + 3);
              result.push_back(std::vector<float>{
                  0, cls_pred, prob, (xc - w / 2) / (im_width - 1),
                  (yc - h / 2) / (im_height - 1),
                  (xc + w / 2) / (im_width - 1),
                  (yc + h / 2) / (im_height - 1) });
            }

          std::sort(result.begin(), result.end(),
                    [](const std::vector<float> &a,
                       const std::vector<float> &b) { return a[2] > b[2]; });

          for (auto &val : result)
            {
              vals.insert(vals.end(), val.begin(), val.end());
            }
          batch_it = end_it;
        }
      return vals;
    }
  }
}
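
A minimal sketch of how this helper could be exercised on a toy tensor. The box values, n_classes = 2, and top_k = 1 below are invented for illustration; only parse_yolo_output's signature comes from the file above.

#include <iostream>
#include <vector>

#include "models/yolo.hpp"

int main()
{
  // One image, one candidate box, 2 classes. Raw per-box layout:
  // x_center, y_center, w, h, objectness, cls0 score, cls1 score
  std::vector<float> raw{ 320.f, 320.f, 100.f, 50.f, 0.9f, 0.1f, 0.8f };
  std::vector<float> dets = dd::yolo_utils::parse_yolo_output(
      raw, /*batch_size=*/1, /*top_k=*/1, /*n_classes=*/2,
      /*im_width=*/640, /*im_height=*/640);
  // dets holds 7 values per detection:
  // batch_id, class_id, confidence, xmin, ymin, xmax, ymax
  // here: 0, 1, 0.72 (= 0.8 * 0.9), with the box normalized by (dim - 1)
  for (float v : dets)
    std::cout << v << " ";
  std::cout << std::endl;
}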
53 changes: 34 additions & 19 deletions src/backends/tensorrt/tensorrtlib.cc
@@ -32,6 +32,7 @@
#include <opencv2/core/cuda_stream_accessor.hpp>
#endif
#include "utils/bbox.hpp"
#include "models/yolo.hpp"

namespace dd
{
@@ -108,6 +109,7 @@ namespace dd
_timeserie = tl._timeserie;
_regression = tl._regression;
_need_nms = tl._need_nms;
_template = tl._template;
_inputIndex = tl._inputIndex;
_outputIndex0 = tl._outputIndex0;
_outputIndex1 = tl._outputIndex1;
@@ -200,23 +202,21 @@
+ this->_mlmodel._repo);
}

// XXX(louis): this default value should be moved out of trt lib when
// init_mllib will be changed to DTOs
if (ad.has("topk"))
_top_k = ad.get("topk").get<int>();

if (ad.has("template"))
{
std::string tmplate = ad.get("template").get<std::string>();
this->_logger->info("Model template is {}", tmplate);
_template = ad.get("template").get<std::string>();
this->_logger->info("Model template is {}", _template);

if (tmplate == "yolox")
if (_template == "yolox")
{
this->_mltype = "detection";
_need_nms = true;
}
else
throw MLLibBadParamException("Unknown template " + tmplate);
throw MLLibBadParamException("Unknown template " + _template);
}

_builder = std::shared_ptr<nvinfer1::IBuilder>(
@@ -544,9 +544,9 @@ namespace dd
this->_logger->info("trying to determine number of classes...");
_nclasses = caffe_proto::findNClasses(this->_mlmodel._def, _bbox);
if (_nclasses < 0)
throw MLLibBadParamException(
"failed detecting the number of classes, specify it through "
"API with nclasses");
throw MLLibBadParamException("failed detecting the number of "
"classes, specify it through "
"API with nclasses");
this->_logger->info("found {} classes", _nclasses);
}

@@ -636,8 +636,7 @@
}
else
{
if (this->_mlmodel._model.find("net_tensorRT.onnx")
!= std::string::npos)
if (this->_mlmodel.is_onnx_source())
_explicit_batch = true;
}

@@ -671,9 +670,16 @@

if (_bbox)
{
if (_dims.nbDims < 3)
throw MLLibBadParamException(
"Bbox model requires 3 output dimensions, found "
+ std::to_string(_dims.nbDims));

_outputIndex1 = _engine->getBindingIndex("keep_count");
_buffers.resize(3);
_floatOut.resize(_max_batch_size * _top_k * 7);
int det_out_size = _max_batch_size * _top_k * _dims.d[2];
// int det_out_size = _max_batch_size * _top_k * 7;
_floatOut.resize(det_out_size);
_keepCount.resize(_max_batch_size);
if (inputc._bw)
cudaMalloc(&_buffers.data()[_inputIndex],
@@ -684,7 +690,7 @@
_max_batch_size * 3 * inputc._height
* inputc._width * sizeof(float));
cudaMalloc(&_buffers.data()[_outputIndex0],
_max_batch_size * _top_k * 7 * sizeof(float));
det_out_size * sizeof(float));
cudaMalloc(&_buffers.data()[_outputIndex1],
_max_batch_size * sizeof(int));
}
@@ -816,7 +822,7 @@
{
cudaMemcpyAsync(_floatOut.data(),
_buffers.data()[_outputIndex0],
num_processed * _top_k * 7 * sizeof(float),
_floatOut.size() * sizeof(float),
cudaMemcpyDeviceToHost, cstream);
cudaMemcpyAsync(_keepCount.data(),
_buffers.data()[_outputIndex1],
@@ -837,10 +843,10 @@
// GAN/raw output
else if (!extract_layer.empty())
{
cudaMemcpyAsync(
_floatOut.data(), _buffers.data()[_outputIndex0],
num_processed * _floatOut.size() * sizeof(float),
cudaMemcpyDeviceToHost, cstream);
cudaMemcpyAsync(_floatOut.data(),
_buffers.data()[_outputIndex0],
_floatOut.size() * sizeof(float),
cudaMemcpyDeviceToHost, cstream);
cudaStreamSynchronize(cstream);
}
else // classification / regression
@@ -868,12 +874,21 @@
if (_bbox)
{
int results_height = _top_k;
const int det_size = 7;

// preproc yolox
if (_template == "yolox")
{
_floatOut = yolo_utils::parse_yolo_output(
_floatOut, num_processed, results_height, _nclasses,
inputc._width, inputc._height);
};

const int det_size = 7;
const float *outr = _floatOut.data();

for (int j = 0; j < num_processed; j++)
{

int k = 0;
std::vector<double> probs;
std::vector<std::string> cats;
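The buffer sizing above is the heart of the fix: with the raw (unwrapped) YOLOX ONNX head, each of the top_k candidates carries n_classes + 5 floats rather than the post-NMS width of 7, so the detection buffers must be sized from the engine's reported binding dimension. A sketch of that sizing logic, with illustrative names standing in for the actual members (_dims.d[2], _floatOut, _buffers):

#include <cuda_runtime_api.h>
#include <vector>

// last_dim stands in for _dims.d[2]: n_classes + 5 (e.g. 85 for COCO)
// on a raw YOLOX head, or 7 on a head with built-in NMS.
void alloc_detection_out(int max_batch_size, int top_k, int last_dim,
                         std::vector<float> &host_out, void **dev_out)
{
  size_t det_out_size
      = static_cast<size_t>(max_batch_size) * top_k * last_dim;
  host_out.resize(det_out_size);
  cudaMalloc(dev_out, det_out_size * sizeof(float));
  // The later device-to-host copy uses host_out.size() so both sides
  // agree whatever the per-detection width turns out to be:
  // cudaMemcpyAsync(host_out.data(), *dev_out,
  //                 host_out.size() * sizeof(float),
  //                 cudaMemcpyDeviceToHost, stream);
}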
2 changes: 2 additions & 0 deletions src/backends/tensorrt/tensorrtlib.h
@@ -122,6 +122,8 @@ namespace dd
bool _writeEngine = true;
std::string _arch;
int _gpuid = 0;
std::string
_template; /**< template for models that require specific treatment */

//!< The TensorRT engine used to run the network
std::shared_ptr<nvinfer1::ICudaEngine> _engine = nullptr;
15 changes: 11 additions & 4 deletions tests/CMakeLists.txt
@@ -350,12 +350,19 @@ if (USE_TENSORRT)
"resnet_onnx_trt.tar.gz"
"resnet_onnx_trt"
)
# DOWNLOAD_DATASET(
# "ONNX yolox model"
# "https://deepdetect.com/models/init/desktop/images/detection/yolox_onnx_trt.tar.gz"
# "examples/trt"
# "yolox_onnx_trt.tar.gz"
# "yolox_onnx_trt"
# )
DOWNLOAD_DATASET(
"ONNX yolox model"
"https://deepdetect.com/models/init/desktop/images/detection/yolox_onnx_trt.tar.gz"
"ONNX yolox model without wrapper"
"https://deepdetect.com/models/init/desktop/images/detection/yolox_onnx_trt_nowrap.tar.gz"
"examples/trt"
"yolox_onnx_trt.tar.gz"
"yolox_onnx_trt"
"yolox_onnx_trt_nowrap.tar.gz"
"yolox_onnx_trt_nowrap"
)
DOWNLOAD_DATASET(
"ONNX CycleGAN model"
34 changes: 26 additions & 8 deletions tests/ut-tensorrtapi.cc
@@ -40,7 +40,7 @@ static std::string squeez_repo = "../examples/trt/squeezenet_ssd_trt/";
static std::string refinedet_repo = "../examples/trt/faces_512/";
static std::string squeezv1_repo = "../examples/trt/squeezenet_v1/";
static std::string resnet_onnx_repo = "../examples/trt/resnet_onnx_trt/";
static std::string yolox_onnx_repo = "../examples/trt/yolox_onnx_trt/";
static std::string yolox_onnx_repo = "../examples/trt/yolox_onnx_trt_nowrap/";
static std::string cyclegan_onnx_repo
= "../examples/trt/cyclegan_resnet_attn_onnx_trt/";

@@ -256,7 +256,7 @@ TEST(tensorrtapi, service_predict_bbox_onnx)
+ yolox_onnx_repo
+ "\"},\"parameters\":{\"input\":{\"connector\":\"image\",\"height\":"
"640,\"width\":640,\"rgb\":true},\"mllib\":{\"template\":\"yolox\","
"\"maxBatchSize\":1,\"maxWorkspaceSize\":256,\"gpuid\":0,"
"\"maxBatchSize\":2,\"maxWorkspaceSize\":256,\"gpuid\":0,"
"\"nclasses\":80}}}";
std::string joutstr = japi.jrender(japi.service_create(sname, jstr));
ASSERT_EQ(created_str, joutstr);
@@ -266,34 +266,52 @@
= "{\"service\":\"" + sname
+ "\",\"parameters\":{\"input\":{},\"output\":{\"bbox\":true,"
"\"confidence_threshold\":0.8}},\"data\":[\""
+ resnet_onnx_repo + "cat.jpg\"]}";
+ resnet_onnx_repo + "cat.jpg\",\"" + yolox_onnx_repo + "dog.jpg\"]}";
joutstr = japi.jrender(japi.service_predict(jpredictstr));
JDoc jd;
std::cout << "joutstr=" << joutstr << std::endl;
jd.Parse<rapidjson::kParseNanAndInfFlag>(joutstr.c_str());
ASSERT_TRUE(!jd.HasParseError());
ASSERT_EQ(200, jd["status"]["code"]);
ASSERT_TRUE(jd["body"]["predictions"].IsArray());
ASSERT_EQ(jd["body"]["predictions"].Size(), 2);

uint32_t cat_id = jd["body"]["predictions"][0]["uri"].GetString()
== (resnet_onnx_repo + "cat.jpg")
? 0
: 1;
uint32_t dog_id = 1 - cat_id;

auto &preds = jd["body"]["predictions"][0]["classes"];
auto &preds = jd["body"]["predictions"][cat_id]["classes"];
ASSERT_EQ(preds.Size(), 1);
std::string cl1 = preds[0]["cat"].GetString();
ASSERT_TRUE(cl1 == "14");
ASSERT_TRUE(preds[0]["prob"].GetDouble() > 0.85);
ASSERT_EQ(cl1, "15");
ASSERT_TRUE(preds[0]["prob"].GetDouble() > 0.9);
auto &bbox = preds[0]["bbox"];
ASSERT_TRUE(bbox["xmin"].GetDouble() < 50 && bbox["xmax"].GetDouble() > 200
&& bbox["ymin"].GetDouble() < 50
&& bbox["ymax"].GetDouble() > 200);
// Check confidence threshold
ASSERT_TRUE(preds[preds.Size() - 1]["prob"].GetDouble() >= 0.8);

// Check second pred
auto &preds2 = jd["body"]["predictions"][dog_id]["classes"];
ASSERT_EQ(preds2.Size(), 1);
std::string cl2 = preds2[0]["cat"].GetString();
ASSERT_EQ(cl2, "16");
ASSERT_TRUE(preds2[0]["prob"].GetDouble() > 0.8);
auto &bbox2 = preds2[0]["bbox"];
ASSERT_TRUE(bbox2["xmin"].GetDouble() < 50 && bbox2["xmax"].GetDouble() > 200
&& bbox2["ymin"].GetDouble() < 50
&& bbox2["ymax"].GetDouble() > 200);

ASSERT_TRUE(fileops::file_exists(yolox_onnx_repo + "TRTengine_arch"
+ get_trt_archi() + "_bs1"));
+ get_trt_archi() + "_bs2"));
jstr = "{\"clear\":\"lib\"}";
joutstr = japi.jrender(japi.service_delete(sname, jstr));
ASSERT_EQ(ok_str, joutstr);
ASSERT_TRUE(!fileops::file_exists(yolox_onnx_repo + "TRTengine_arch"
+ get_trt_archi() + "_bs1"));
+ get_trt_archi() + "_bs2"));
}

TEST(tensorrtapi, service_predict_gan_onnx)
24 changes: 18 additions & 6 deletions tools/torch/trace_yolox.py
@@ -23,6 +23,11 @@ def main():
parser.add_argument('--num_classes', type=int, default=80, help="Number of classes of the model")
parser.add_argument('--gpu', type=int, help="GPU id to run on GPU")
parser.add_argument('--to_onnx', action="store_true", help="Export model to onnx")
parser.add_argument('--use_wrapper', action="store_true", help="In case of onnx export, if this option is present, the model will be wrapped so that its output match dede expectations")
parser.add_argument('--top_k', type=int, default=200, help="When exporting to onnx, specify maximum returned prediction count")
parser.add_argument('--batch_size', type=int, default=1, help="When exporting to onnx, batch size of model")
parser.add_argument('--img_width', type=int, default=640, help="Width of the image when exporting with fixed image size")
parser.add_argument('--img_height', type=int, default=640, help="Height of the image when exporting with fixed image size")

args = parser.parse_args()

@@ -78,18 +83,21 @@ def main():
if args.to_onnx:
model = replace_module(model, nn.SiLU, SiLU)

model = YoloXWrapper_TRT(model)
model = YoloXWrapper_TRT(model, topk = args.top_k, raw_output = not args.use_wrapper)
model.to(device)
model.eval()

filename += ".onnx"
example = get_image_input(1, 640, 640)
example = get_image_input(args.batch_size, args.img_width, args.img_height)
# XXX: dynamic batch size not supported with wrapper
# XXX: dynamic batch size not yet supported in dede as well
dynamic_axes = None # {"input": {0: "batch"}} if not args.use_wrapper else None
torch.onnx.export(
model, example, filename,
export_params=True, verbose=args.verbose,
opset_version=11, do_constant_folding=True,
input_names=["input"], output_names=["detection_out", "keep_count"])
#, dynamic_axes={"input": {0: "batch"}})
input_names=["input"], output_names=["detection_out", "keep_count"],
dynamic_axes = dynamic_axes)
else:
# wrap model
model = YoloXWrapper(model, args.num_classes, postprocess)
@@ -187,10 +195,11 @@ def forward(self, x, ids = None, bboxes = None, labels = None):

class YoloXWrapper_TRT(torch.nn.Module):

def __init__(self, model, topk=200):
def __init__(self, model, topk=200, raw_output=False):
super(YoloXWrapper_TRT, self).__init__()
self.model = model
self.topk = topk
self.raw_output = raw_output

def to_xyxy(self, boxes):
xyxy_boxes = boxes.new_zeros(boxes.shape)
@@ -204,8 +213,11 @@ def forward(self, x):
# xmin, ymin, xmax, ymax, objectness, conf cls1, conf cl2...
output = self.model(x)[0]

if self.raw_output:
return output, torch.zeros(output.shape[0])

box_count = output.shape[1]
cls_scores, cls_pred = output[:,:,6:].max(dim=2, keepdim=True)
cls_scores, cls_pred = output[:,:,5:].max(dim=2, keepdim=True)
batch_ids = torch.arange(output.shape[0], device=x.device).view(
-1, 1).repeat(1, output.shape[1]).unsqueeze(2)

Expand Down
