I am having trouble with INetworkDefinition::addPlugin(). I want to add a Reshape plugin when building a CNN through the API rather than through the caffe parser.
I wrote a plugin demo, shown below. It segfaults in nvinfer1::cudnn::PluginLayer::serializeParams(). Could anyone help?
#include "NvInfer.h" #include "cuda_runtime_api.h" #include <cassert> #include <cmath> #include <ctime> #include <cstring> #include <fstream> #include <iostream> #include <map> #include <sstream> #include <sys/stat.h> #include <vector> #include <algorithm> #define CHECK(status) \ { \ if (status != 0) \ { \ std::cout << "Cuda failure: " << status; \ abort(); \ } \ } // stuff we know about the network and the input/output blobs static const int INPUT_H = 4; static const int INPUT_W = 4; static const int OUTPUT_SIZE = INPUT_H * INPUT_W; const char* INPUT_BLOB_NAME = "input"; const char* OUTPUT_BLOB_NAME = "output"; using namespace nvinfer1; using namespace std; // Logger for GIE info/warning/errors class Logger : public nvinfer1::ILogger { public: void log(nvinfer1::ILogger::Severity severity, const char* msg) override { // suppress info-level messages if (severity == Severity::kINFO) return; switch (severity) { case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; case Severity::kERROR: std::cerr << "ERROR: "; break; case Severity::kWARNING: std::cerr << "WARNING: "; break; case Severity::kINFO: std::cerr << "INFO: "; break; default: std::cerr << "UNKNOWN: "; break; } std::cerr << msg << std::endl; } }; static Logger gLogger; class Reshape : public IPlugin { public: Reshape() {} Reshape(const void* buffer, size_t size) { assert(size == sizeof(mCopySize)); mCopySize = *reinterpret_cast<const size_t*>(buffer); } int getNbOutputs() const override { cout << "getNbOutputs" << endl; return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override { cout << "getOutputDimensions" << endl; assert(nbInputDims == 1); assert(index == 0); assert(inputs[index].nbDims == 3); return DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]); } int initialize() override { cout << "initialize" << endl; return 0; } void terminate() override { cout << "terminate" << endl; } size_t getWorkspaceSize(int) const override { cout << "getWorkspaceSize" << endl; return 0; } // currently it is not possible for a plugin to execute "in place". Therefore we memcpy the data from the input to the output buffer int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override { cout << "enqueue" << endl; CHECK(cudaMemcpyAsync(outputs[0], inputs[0], mCopySize * batchSize, cudaMemcpyDeviceToDevice, stream)); return 0; } size_t getSerializationSize() override { cout << "getSerializationSize" << endl; return sizeof(mCopySize); } void serialize(void* buffer) override { cout << "serialize" << endl; *reinterpret_cast<size_t*>(buffer) = mCopySize; } void configure(const Dims*inputs, int nbInputs, const Dims* outputs, int nbOutputs, int) override { cout << "configure" << endl; mCopySize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float); } protected: size_t mCopySize; }; // Creat the Engine using only the API and not any parser. ICudaEngine * createMNISTEngine(unsigned int maxBatchSize, IBuilder *builder, DataType dt) { INetworkDefinition* network = builder->createNetwork(); // Create input of shape { 1, 1, 28, 28 } with name referenced by INPUT_BLOB_NAME auto data = network->addInput(INPUT_BLOB_NAME, dt, DimsCHW{ 1, INPUT_H, INPUT_W}); assert(data != nullptr); // Create a scale layer with default power/shift and specified scale parameter. 
    float scale_param = 2.0;
    Weights power{DataType::kFLOAT, nullptr, 0};
    Weights shift{DataType::kFLOAT, nullptr, 0};
    Weights scale{DataType::kFLOAT, &scale_param, 1};
    auto scale_1 = network->addScale(*data, ScaleMode::kUNIFORM, shift, scale, power);
    assert(scale_1 != nullptr);

    auto plugin_in = scale_1->getOutput(0);

    // construct the plugin via Reshape(const void* buffer, size_t size)
    size_t* reshape_size = new size_t;
    Reshape reshape(reshape_size, sizeof(size_t));
    auto plugin = network->addPlugin(&plugin_in, 1, reshape);
    auto res = plugin->getOutput(0);

    res->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*res);

    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);
    auto engine = builder->buildCudaEngine(*network);
    // engine->serialize();

    // we don't need the network any more
    network->destroy();
    return engine;
}

void APIToModel(unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
                IHostMemory** modelStream)
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger);

    // create the model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createMNISTEngine(maxBatchSize, builder, DataType::kFLOAT);
    assert(engine != nullptr);

    // serialize the engine, then close everything down
    (*modelStream) = engine->serialize();
    engine->destroy();
    builder->destroy();
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // input and output buffer pointers that we pass to the engine - the engine requires
    // exactly IEngine::getNbBindings() of these, but in this case we know that there is
    // exactly one input and one output.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings().
    int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
        outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the GPU, execute the batch asynchronously, and DMA the output back
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // release the stream and the buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv)
{
    // create a model using the API directly and serialize it to a stream
    IHostMemory* modelStream{nullptr};
    APIToModel(1, &modelStream);

    float data[INPUT_H * INPUT_W];
    cout << "input: " << endl;
    for (int i = 0; i < INPUT_H * INPUT_W; i++)
    {
        data[i] = i - 10;
        cout << data[i] << " ";
    }
    std::cout << "\n\n";

    IRuntime* runtime = createInferRuntime(gLogger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), nullptr);
    if (modelStream) modelStream->destroy();
    IExecutionContext* context = engine->createExecutionContext();

    // run inference
    float prob[OUTPUT_SIZE];
    doInference(*context, data, prob, 1);

    // destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    cout << "output: " << endl;
    for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    {
        cout << prob[i] << " ";
    }
    std::cout << std::endl;

    return 0;
}
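My current guess: reshape is a stack local inside createMNISTEngine(), so the IPlugin reference stored by addPlugin() is dangling by the time engine->serialize() runs in APIToModel(), which would explain the crash inside PluginLayer::serializeParams(). On top of that, *reshape_size is never initialized, so mCopySize starts out as garbage. A minimal sketch of the change I am considering inside createMNISTEngine() (the heap allocation and the default-constructed plugin are my assumptions, not taken from any sample):

    // Sketch: keep the plugin alive until after engine->serialize(),
    // and default-construct it so configure() computes mCopySize
    // instead of reading it from an uninitialized buffer.
    Reshape* reshape = new Reshape();
    auto plugin = network->addPlugin(&plugin_in, 1, *reshape);
    auto res = plugin->getOutput(0);
    // ... build and serialize as before; delete the plugin only
    // after the engine has been serialized and destroyed.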
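Even if that fixes the serialize crash, I suspect deserializeCudaEngine() cannot take nullptr once the engine contains a plugin layer, and instead needs an IPluginFactory that recreates the plugin from its serialized bytes. A sketch of what I think that looks like, based on my reading of NvInfer.h (unverified):

    // Sketch: factory that recreates the Reshape plugin during deserialization.
    class PluginFactory : public nvinfer1::IPluginFactory
    {
    public:
        IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override
        {
            // every plugin layer in this toy network is a Reshape
            mPlugin = new Reshape(serialData, serialLength);
            return mPlugin;
        }
        IPlugin* mPlugin{nullptr};
    };

    PluginFactory pluginFactory;
    ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), &pluginFactory);

Is that the right direction, or is the segfault caused by something else?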